1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n) internal_function;
22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
23 static void match_ctx_free (re_match_context_t *cache) internal_function;
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to)
27 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
30 Idx str_idx) internal_function;
31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
32 Idx node, Idx str_idx)
34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35 re_dfastate_t **limited_sts, Idx last_node,
38 static reg_errcode_t re_search_internal (const regex_t *preg,
39 const char *string, Idx length,
40 Idx start, regoff_t range, Idx stop,
41 size_t nmatch, regmatch_t pmatch[],
42 int eflags) internal_function;
43 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
44 const char *string1, Idx length1,
45 const char *string2, Idx length2,
46 Idx start, regoff_t range,
47 struct re_registers *regs,
48 Idx stop, int ret_len) internal_function;
49 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, Idx length, Idx start,
51 regoff_t range, Idx stop,
52 struct re_registers *regs,
53 int ret_len) internal_function;
54 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
55 Idx nregs, int regs_allocated) internal_function;
56 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
58 static Idx check_matching (re_match_context_t *mctx, int fl_longest_match,
61 static Idx check_halt_state_context (const re_match_context_t *mctx,
62 const re_dfastate_t *state, Idx idx)
64 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
65 regmatch_t *prev_idx_match, Idx cur_node,
66 Idx cur_idx, Idx nmatch) internal_function;
67 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
68 Idx str_idx, Idx dest_node, Idx nregs,
70 re_node_set *eps_via_nodes) internal_function;
71 static reg_errcode_t set_regs (const regex_t *preg,
72 const re_match_context_t *mctx,
73 size_t nmatch, regmatch_t *pmatch,
74 int fl_backtrack) internal_function;
75 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
78 static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 Idx node_idx, Idx str_idx, Idx max_str_idx) internal_function;
81 #endif /* RE_ENABLE_I18N */
82 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
83 re_sift_context_t *sctx) internal_function;
84 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
85 re_sift_context_t *sctx, Idx str_idx,
86 re_node_set *cur_dest) internal_function;
87 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
88 re_sift_context_t *sctx,
90 re_node_set *dest_nodes) internal_function;
91 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
92 re_node_set *dest_nodes,
93 const re_node_set *candidates) internal_function;
94 static int check_dst_limits (const re_match_context_t *mctx,
95 const re_node_set *limits,
96 Idx dst_node, Idx dst_idx, Idx src_node,
97 Idx src_idx) internal_function;
98 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
99 int boundaries, Idx subexp_idx,
100 Idx from_node, Idx bkref_idx) internal_function;
101 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
102 Idx limit, Idx subexp_idx,
103 Idx node, Idx str_idx,
104 Idx bkref_idx) internal_function;
105 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
106 re_node_set *dest_nodes,
107 const re_node_set *candidates,
109 struct re_backref_cache_entry *bkref_ents,
110 Idx str_idx) internal_function;
111 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
112 re_sift_context_t *sctx,
113 Idx str_idx, const re_node_set *candidates) internal_function;
114 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
115 re_dfastate_t **src, Idx num) internal_function;
116 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
117 re_match_context_t *mctx) internal_function;
118 static re_dfastate_t *transit_state (reg_errcode_t *err,
119 re_match_context_t *mctx,
120 re_dfastate_t *state) internal_function;
121 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
122 re_match_context_t *mctx,
123 re_dfastate_t *next_state) internal_function;
124 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
125 re_node_set *cur_nodes,
126 Idx str_idx) internal_function;
128 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
129 re_match_context_t *mctx,
130 re_dfastate_t *pstate) internal_function;
132 #ifdef RE_ENABLE_I18N
133 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
134 re_dfastate_t *pstate) internal_function;
135 #endif /* RE_ENABLE_I18N */
136 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
137 const re_node_set *nodes) internal_function;
138 static reg_errcode_t get_subexp (re_match_context_t *mctx,
139 Idx bkref_node, Idx bkref_str_idx) internal_function;
140 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
141 const re_sub_match_top_t *sub_top,
142 re_sub_match_last_t *sub_last,
143 Idx bkref_node, Idx bkref_str) internal_function;
144 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
145 Idx subexp_idx, int type) internal_function;
146 static reg_errcode_t check_arrival (re_match_context_t *mctx,
147 state_array_t *path, Idx top_node,
148 Idx top_str, Idx last_node, Idx last_str,
149 int type) internal_function;
150 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
152 re_node_set *cur_nodes,
153 re_node_set *next_nodes) internal_function;
154 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
155 re_node_set *cur_nodes,
156 Idx ex_subexp, int type) internal_function;
157 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
158 re_node_set *dst_nodes,
159 Idx target, Idx ex_subexp,
160 int type) internal_function;
161 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
162 re_node_set *cur_nodes, Idx cur_str,
163 Idx subexp_num, int type) internal_function;
164 static int build_trtable (re_dfa_t *dfa,
165 re_dfastate_t *state) internal_function;
166 #ifdef RE_ENABLE_I18N
167 static int check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
168 const re_string_t *input, Idx idx) internal_function;
170 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
171 size_t name_len) internal_function;
173 #endif /* RE_ENABLE_I18N */
174 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
175 const re_dfastate_t *state,
176 re_node_set *states_node,
177 bitset *states_ch) internal_function;
178 static int check_node_accept (const re_match_context_t *mctx,
179 const re_token_t *node, Idx idx) internal_function;
180 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
182 /* Entry point for POSIX code. */
184 /* regexec searches for a given pattern, specified by PREG, in the
187 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
188 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
189 least NMATCH elements, and we set them to the offsets of the
190 corresponding matched substrings.
192 EFLAGS specifies `execution flags' which affect matching: if
193 REG_NOTBOL is set, then ^ does not match at the beginning of the
194 string; if REG_NOTEOL is set, then $ does not match at the end.
196 We return 0 if we find a match and REG_NOMATCH if not. */
199 regexec (const regex_t *__restrict preg, const char *__restrict string,
200 size_t nmatch, regmatch_t pmatch[], int eflags)
205 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
208 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
211 if (eflags & REG_STARTEND)
213 start = pmatch[0].rm_so;
214 length = pmatch[0].rm_eo;
219 length = strlen (string);
222 __libc_lock_lock (dfa->lock);
224 err = re_search_internal (preg, string, length, start, length - start,
225 length, 0, NULL, eflags);
227 err = re_search_internal (preg, string, length, start, length - start,
228 length, nmatch, pmatch, eflags);
229 __libc_lock_unlock (dfa->lock);
230 return err != REG_NOERROR;
234 # include <shlib-compat.h>
235 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
237 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
238 __typeof__ (__regexec) __compat_regexec;
241 attribute_compat_text_section
242 __compat_regexec (const regex_t *__restrict preg,
243 const char *__restrict string, size_t nmatch,
244 regmatch_t pmatch[], int eflags)
246 return regexec (preg, string, nmatch, pmatch,
247 eflags & (REG_NOTBOL | REG_NOTEOL));
249 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
253 /* Entry points for GNU code. */
255 /* re_match, re_search, re_match_2, re_search_2
257 The former two functions operate on STRING with length LENGTH,
258 while the later two operate on concatenation of STRING1 and STRING2
259 with lengths LENGTH1 and LENGTH2, respectively.
261 re_match() matches the compiled pattern in BUFP against the string,
262 starting at index START.
264 re_search() first tries matching at index START, then it tries to match
265 starting from index START + 1, and so on. The last start position tried
266 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
269 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
270 the first STOP characters of the concatenation of the strings should be
273 If REGS is not NULL, and BUFP->re_no_sub is not set, the offsets of the match
274 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
275 computed relative to the concatenation, not relative to the individual
278 On success, re_match* functions return the length of the match, re_search*
279 return the position of the start of the match. Return value -1 means no
280 match was found and -2 indicates an internal error. */
283 re_match (struct re_pattern_buffer *bufp, const char *string,
284 Idx length, Idx start, struct re_registers *regs)
286 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
289 weak_alias (__re_match, re_match)
293 re_search (struct re_pattern_buffer *bufp, const char *string,
294 Idx length, Idx start, regoff_t range, struct re_registers *regs)
296 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
299 weak_alias (__re_search, re_search)
303 re_match_2 (struct re_pattern_buffer *bufp,
304 const char *string1, Idx length1,
305 const char *string2, Idx length2,
306 Idx start, struct re_registers *regs, Idx stop)
308 return re_search_2_stub (bufp, string1, length1, string2, length2,
309 start, 0, regs, stop, 1);
312 weak_alias (__re_match_2, re_match_2)
316 re_search_2 (struct re_pattern_buffer *bufp,
317 const char *string1, Idx length1,
318 const char *string2, Idx length2,
319 Idx start, regoff_t range, struct re_registers *regs, Idx stop)
321 return re_search_2_stub (bufp, string1, length1, string2, length2,
322 start, range, regs, stop, 0);
325 weak_alias (__re_search_2, re_search_2)
330 re_search_2_stub (struct re_pattern_buffer *bufp,
331 const char *string1, Idx length1,
332 const char *string2, Idx length2,
333 Idx start, regoff_t range, struct re_registers *regs,
334 Idx stop, int ret_len)
338 Idx len = length1 + length2;
341 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
344 /* Concatenate the strings. */
348 char *s = re_malloc (char, len);
350 if (BE (s == NULL, 0))
352 memcpy (s, string1, length1);
353 memcpy (s + length1, string2, length2);
362 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
365 re_free ((char *) str);
369 /* The parameters have the same meaning as those of re_search.
370 Additional parameters:
371 If RET_LEN is nonzero the length of the match is returned (re_match style);
372 otherwise the position of the match is returned. */
376 re_search_stub (struct re_pattern_buffer *bufp,
377 const char *string, Idx length,
378 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
381 reg_errcode_t result;
387 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
390 /* Check for out-of-range. */
391 if (BE (start < 0 || start > length, 0))
393 if (sizeof start < sizeof range)
395 regoff_t length_offset = length;
396 regoff_t start_offset = start;
397 if (BE (length_offset - start_offset < range, 0))
398 range = length_offset - start_offset;
399 else if (BE (range < - start_offset, 0))
400 range = -start_offset;
404 if (BE (start + range > length, 0))
405 range = length - start;
406 else if (BE (start + range < 0, 0))
410 __libc_lock_lock (dfa->lock);
412 eflags |= (bufp->re_not_bol) ? REG_NOTBOL : 0;
413 eflags |= (bufp->re_not_eol) ? REG_NOTEOL : 0;
415 /* Compile fastmap if we haven't yet. */
416 if (range > 0 && bufp->re_fastmap != NULL && !bufp->re_fastmap_accurate)
417 re_compile_fastmap (bufp);
419 if (BE (bufp->re_no_sub, 0))
422 /* We need at least 1 register. */
425 else if (BE (bufp->re_regs_allocated == REG_FIXED
426 && regs->rm_num_regs < bufp->re_nsub + 1, 0))
428 nregs = regs->rm_num_regs;
429 if (BE (nregs < 1, 0))
431 /* Nothing can be copied to regs. */
437 nregs = bufp->re_nsub + 1;
438 pmatch = re_malloc (regmatch_t, nregs);
439 if (BE (pmatch == NULL, 0))
445 result = re_search_internal (bufp, string, length, start, range, stop,
446 nregs, pmatch, eflags);
450 /* I hope we needn't fill ther regs with -1's when no match was found. */
451 if (result != REG_NOERROR)
453 else if (regs != NULL)
455 /* If caller wants register contents data back, copy them. */
456 bufp->re_regs_allocated = re_copy_regs (regs, pmatch, nregs,
457 bufp->re_regs_allocated);
458 if (BE (bufp->re_regs_allocated == REG_UNALLOCATED, 0))
462 if (BE (rval == 0, 1))
466 assert (pmatch[0].rm_so == start);
467 rval = pmatch[0].rm_eo - start;
470 rval = pmatch[0].rm_so;
474 __libc_lock_unlock (dfa->lock);
480 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
483 int rval = REG_REALLOCATE;
485 Idx need_regs = nregs + 1;
486 /* We need one extra element beyond `rm_num_regs' for the `-1' marker GNU code
489 /* Have the register data arrays been allocated? */
490 if (regs_allocated == REG_UNALLOCATED)
491 { /* No. So allocate them with malloc. */
492 regs->rm_start = re_malloc (regoff_t, need_regs);
493 regs->rm_end = re_malloc (regoff_t, need_regs);
494 if (BE (regs->rm_start == NULL, 0) || BE (regs->rm_end == NULL, 0))
495 return REG_UNALLOCATED;
496 regs->rm_num_regs = need_regs;
498 else if (regs_allocated == REG_REALLOCATE)
499 { /* Yes. If we need more elements than were already
500 allocated, reallocate them. If we need fewer, just
502 if (BE (need_regs > regs->rm_num_regs, 0))
504 regoff_t *new_start =
505 re_realloc (regs->rm_start, regoff_t, need_regs);
506 regoff_t *new_end = re_realloc (regs->rm_end, regoff_t, need_regs);
507 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
508 return REG_UNALLOCATED;
509 regs->rm_start = new_start;
510 regs->rm_end = new_end;
511 regs->rm_num_regs = need_regs;
516 assert (regs_allocated == REG_FIXED);
517 /* This function may not be called with REG_FIXED and nregs too big. */
518 assert (regs->rm_num_regs >= nregs);
523 for (i = 0; i < nregs; ++i)
525 regs->rm_start[i] = pmatch[i].rm_so;
526 regs->rm_end[i] = pmatch[i].rm_eo;
528 for ( ; i < regs->rm_num_regs; ++i)
529 regs->rm_start[i] = regs->rm_end[i] = -1;
534 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
535 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
536 this memory for recording register information. STARTS and ENDS
537 must be allocated using the malloc library routine, and must each
538 be at least NUM_REGS * sizeof (regoff_t) bytes long.
540 If NUM_REGS == 0, then subsequent matches should allocate their own
543 Unless this function is called, the first search or match using
544 PATTERN_BUFFER will allocate its own register data, without
545 freeing the old data. */
548 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
549 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
553 bufp->re_regs_allocated = REG_REALLOCATE;
554 regs->rm_num_regs = num_regs;
555 regs->rm_start = starts;
560 bufp->re_regs_allocated = REG_UNALLOCATED;
561 regs->rm_num_regs = 0;
562 regs->rm_start = regs->rm_end = NULL;
566 weak_alias (__re_set_registers, re_set_registers)
569 /* Entry points compatible with 4.2 BSD regex library. We don't define
570 them unless specifically requested. */
572 #if defined _REGEX_RE_COMP || defined _LIBC
577 re_exec (const char *s)
579 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
581 #endif /* _REGEX_RE_COMP */
583 /* Internal entry point. */
585 /* Searches for a compiled pattern PREG in the string STRING, whose
586 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
587 mingings with regexec. START, and RANGE have the same meanings
589 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
590 otherwise return the error code.
591 Note: We assume front end functions already check ranges.
592 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
596 re_search_internal (const regex_t *preg,
597 const char *string, Idx length,
598 Idx start, regoff_t range, Idx stop,
599 size_t nmatch, regmatch_t pmatch[],
603 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
604 Idx left_lim, right_lim;
606 int fl_longest_match, match_kind;
607 Idx match_first, match_last = REG_MISSING;
610 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
611 re_match_context_t mctx = { .dfa = dfa };
613 re_match_context_t mctx;
615 char *fastmap = (preg->re_fastmap != NULL && preg->re_fastmap_accurate
616 && range && !preg->re_can_be_null) ? preg->re_fastmap : NULL;
617 unsigned REG_TRANSLATE_TYPE t =
618 (unsigned REG_TRANSLATE_TYPE) preg->re_translate;
620 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
621 memset (&mctx, '\0', sizeof (re_match_context_t));
625 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
626 nmatch -= extra_nmatch;
628 /* Check if the DFA haven't been compiled. */
629 if (BE (preg->re_used == 0 || dfa->init_state == NULL
630 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
631 || dfa->init_state_begbuf == NULL, 0))
635 /* We assume front-end functions already check them. */
636 assert (start + range >= 0 && start + range <= length);
639 /* If initial states with non-begbuf contexts have no elements,
640 the regex must be anchored. If preg->re_newline_anchor is set,
641 we'll never use init_state_nl, so do not check it. */
642 if (dfa->init_state->nodes.nelem == 0
643 && dfa->init_state_word->nodes.nelem == 0
644 && (dfa->init_state_nl->nodes.nelem == 0
645 || !preg->re_newline_anchor))
647 if (start != 0 && start + range != 0)
652 /* We must check the longest matching, if nmatch > 0. */
653 fl_longest_match = (nmatch != 0 || dfa->nbackref);
655 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
657 preg->re_syntax & REG_IGNORE_CASE, dfa);
658 if (BE (err != REG_NOERROR, 0))
660 mctx.input.stop = stop;
661 mctx.input.raw_stop = stop;
662 mctx.input.newline_anchor = preg->re_newline_anchor;
664 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
665 if (BE (err != REG_NOERROR, 0))
668 /* We will log all the DFA states through which the dfa pass,
669 if nmatch > 1, or this dfa has "multibyte node", which is a
670 back-reference or a node which can accept multibyte character or
671 multi character collating element. */
672 if (nmatch > 1 || dfa->has_mb_node)
674 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
675 if (BE (mctx.state_log == NULL, 0))
682 mctx.state_log = NULL;
685 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
686 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
688 /* Check incrementally whether of not the input string match. */
689 incr = (range < 0) ? -1 : 1;
690 left_lim = (range < 0) ? start + range : start;
691 right_lim = (range < 0) ? start : start + range;
692 sb = dfa->mb_cur_max == 1;
695 ? ((sb || !(preg->re_syntax & REG_IGNORE_CASE || t) ? 4 : 0)
696 | (range >= 0 ? 2 : 0)
697 | (t != NULL ? 1 : 0))
700 for (;; match_first += incr)
703 if (match_first < left_lim || right_lim < match_first)
706 /* Advance as rapidly as possible through the string, until we
707 find a plausible place to start matching. This may be done
708 with varying efficiency, so there are various possibilities:
709 only the most common of them are specialized, in order to
710 save on code size. We use a switch statement for speed. */
718 /* Fastmap with single-byte translation, match forward. */
719 while (BE (match_first < right_lim, 1)
720 && !fastmap[t[(unsigned char) string[match_first]]])
722 goto forward_match_found_start_or_reached_end;
725 /* Fastmap without translation, match forward. */
726 while (BE (match_first < right_lim, 1)
727 && !fastmap[(unsigned char) string[match_first]])
730 forward_match_found_start_or_reached_end:
731 if (BE (match_first == right_lim, 0))
733 ch = match_first >= length
734 ? 0 : (unsigned char) string[match_first];
735 if (!fastmap[t ? t[ch] : ch])
742 /* Fastmap without multi-byte translation, match backwards. */
743 while (match_first >= left_lim)
745 ch = match_first >= length
746 ? 0 : (unsigned char) string[match_first];
747 if (fastmap[t ? t[ch] : ch])
751 if (match_first < left_lim)
756 /* In this case, we can't determine easily the current byte,
757 since it might be a component byte of a multibyte
758 character. Then we use the constructed buffer instead. */
761 /* If MATCH_FIRST is out of the valid range, reconstruct the
763 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
764 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
766 err = re_string_reconstruct (&mctx.input, match_first,
768 if (BE (err != REG_NOERROR, 0))
771 offset = match_first - mctx.input.raw_mbs_idx;
773 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
774 Note that MATCH_FIRST must not be smaller than 0. */
775 ch = (match_first >= length
776 ? 0 : re_string_byte_at (&mctx.input, offset));
780 if (match_first < left_lim || match_first > right_lim)
789 /* Reconstruct the buffers so that the matcher can assume that
790 the matching starts from the beginning of the buffer. */
791 err = re_string_reconstruct (&mctx.input, match_first, eflags);
792 if (BE (err != REG_NOERROR, 0))
795 #ifdef RE_ENABLE_I18N
796 /* Don't consider this char as a possible match start if it part,
797 yet isn't the head, of a multibyte character. */
798 if (!sb && !re_string_first_byte (&mctx.input, 0))
802 /* It seems to be appropriate one, then use the matcher. */
803 /* We assume that the matching starts from 0. */
804 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
805 match_last = check_matching (&mctx, fl_longest_match,
806 range >= 0 ? &match_first : NULL);
807 if (match_last != REG_MISSING)
809 if (BE (match_last == REG_ERROR, 0))
816 mctx.match_last = match_last;
817 if ((!preg->re_no_sub && nmatch > 1) || dfa->nbackref)
819 re_dfastate_t *pstate = mctx.state_log[match_last];
820 mctx.last_node = check_halt_state_context (&mctx, pstate,
823 if ((!preg->re_no_sub && nmatch > 1 && dfa->has_plural_match)
826 err = prune_impossible_nodes (&mctx);
827 if (err == REG_NOERROR)
829 if (BE (err != REG_NOMATCH, 0))
831 match_last = REG_MISSING;
834 break; /* We found a match. */
838 match_ctx_clean (&mctx);
842 assert (match_last != REG_MISSING);
843 assert (err == REG_NOERROR);
846 /* Set pmatch[] if we need. */
851 /* Initialize registers. */
852 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
853 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
855 /* Set the points where matching start/end. */
857 pmatch[0].rm_eo = mctx.match_last;
858 /* FIXME: This function should fail if mctx.match_last exceeds
859 the maximum possible regoff_t value. We need a new error
860 code REG_OVERFLOW. */
862 if (!preg->re_no_sub && nmatch > 1)
864 err = set_regs (preg, &mctx, nmatch, pmatch,
865 dfa->has_plural_match && dfa->nbackref > 0);
866 if (BE (err != REG_NOERROR, 0))
870 /* At last, add the offset to the each registers, since we slided
871 the buffers so that we could assume that the matching starts
873 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
874 if (pmatch[reg_idx].rm_so != -1)
876 #ifdef RE_ENABLE_I18N
877 if (BE (mctx.input.offsets_needed != 0, 0))
879 pmatch[reg_idx].rm_so =
880 (pmatch[reg_idx].rm_so == mctx.input.valid_len
881 ? mctx.input.valid_raw_len
882 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
883 pmatch[reg_idx].rm_eo =
884 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
885 ? mctx.input.valid_raw_len
886 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
889 assert (mctx.input.offsets_needed == 0);
891 pmatch[reg_idx].rm_so += match_first;
892 pmatch[reg_idx].rm_eo += match_first;
894 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
896 pmatch[nmatch + reg_idx].rm_so = -1;
897 pmatch[nmatch + reg_idx].rm_eo = -1;
901 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
902 if (dfa->subexp_map[reg_idx] != reg_idx)
904 pmatch[reg_idx + 1].rm_so
905 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
906 pmatch[reg_idx + 1].rm_eo
907 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
912 re_free (mctx.state_log);
914 match_ctx_free (&mctx);
915 re_string_destruct (&mctx.input);
921 prune_impossible_nodes (re_match_context_t *mctx)
923 re_dfa_t *const dfa = mctx->dfa;
924 Idx halt_node, match_last;
926 re_dfastate_t **sifted_states;
927 re_dfastate_t **lim_states = NULL;
928 re_sift_context_t sctx;
930 assert (mctx->state_log != NULL);
932 match_last = mctx->match_last;
933 halt_node = mctx->last_node;
934 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
935 if (BE (sifted_states == NULL, 0))
942 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
943 if (BE (lim_states == NULL, 0))
950 memset (lim_states, '\0',
951 sizeof (re_dfastate_t *) * (match_last + 1));
952 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
954 ret = sift_states_backward (mctx, &sctx);
955 re_node_set_free (&sctx.limits);
956 if (BE (ret != REG_NOERROR, 0))
958 if (sifted_states[0] != NULL || lim_states[0] != NULL)
963 if (! REG_VALID_INDEX (match_last))
968 } while (mctx->state_log[match_last] == NULL
969 || !mctx->state_log[match_last]->halt);
970 halt_node = check_halt_state_context (mctx,
971 mctx->state_log[match_last],
974 ret = merge_state_array (dfa, sifted_states, lim_states,
976 re_free (lim_states);
978 if (BE (ret != REG_NOERROR, 0))
983 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
984 ret = sift_states_backward (mctx, &sctx);
985 re_node_set_free (&sctx.limits);
986 if (BE (ret != REG_NOERROR, 0))
989 re_free (mctx->state_log);
990 mctx->state_log = sifted_states;
991 sifted_states = NULL;
992 mctx->last_node = halt_node;
993 mctx->match_last = match_last;
996 re_free (sifted_states);
997 re_free (lim_states);
1001 /* Acquire an initial state and return it.
1002 We must select appropriate initial state depending on the context,
1003 since initial states may have constraints like "\<", "^", etc.. */
1005 static inline re_dfastate_t *
1006 __attribute ((always_inline)) internal_function
1007 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1010 re_dfa_t *const dfa = mctx->dfa;
1011 if (dfa->init_state->has_constraint)
1013 unsigned int context;
1014 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1015 if (IS_WORD_CONTEXT (context))
1016 return dfa->init_state_word;
1017 else if (IS_ORDINARY_CONTEXT (context))
1018 return dfa->init_state;
1019 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1020 return dfa->init_state_begbuf;
1021 else if (IS_NEWLINE_CONTEXT (context))
1022 return dfa->init_state_nl;
1023 else if (IS_BEGBUF_CONTEXT (context))
1025 /* It is relatively rare case, then calculate on demand. */
1026 return re_acquire_state_context (err, dfa,
1027 dfa->init_state->entrance_nodes,
1031 /* Must not happen? */
1032 return dfa->init_state;
1035 return dfa->init_state;
1038 /* Check whether the regular expression match input string INPUT or not,
1039 and return the index where the matching end. Return REG_MISSING if
1040 there is no match, and return REG_ERROR in case of an error.
1041 FL_LONGEST_MATCH means we want the POSIX longest matching.
1042 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1043 next place where we may want to try matching.
1044 Note that the matcher assume that the maching starts from the current
1045 index of the buffer. */
1049 check_matching (re_match_context_t *mctx, int fl_longest_match,
1052 re_dfa_t *const dfa = mctx->dfa;
1055 Idx match_last = REG_MISSING;
1056 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1057 re_dfastate_t *cur_state;
1058 int at_init_state = p_match_first != NULL;
1059 Idx next_start_idx = cur_str_idx;
1062 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1063 /* An initial state must not be NULL (invalid). */
1064 if (BE (cur_state == NULL, 0))
1066 assert (err == REG_ESPACE);
1070 if (mctx->state_log != NULL)
1072 mctx->state_log[cur_str_idx] = cur_state;
1074 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1075 later. E.g. Processing back references. */
1076 if (BE (dfa->nbackref, 0))
1079 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1080 if (BE (err != REG_NOERROR, 0))
1083 if (cur_state->has_backref)
1085 err = transit_state_bkref (mctx, &cur_state->nodes);
1086 if (BE (err != REG_NOERROR, 0))
1092 /* If the RE accepts NULL string. */
1093 if (BE (cur_state->halt, 0))
1095 if (!cur_state->has_constraint
1096 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1098 if (!fl_longest_match)
1102 match_last = cur_str_idx;
1108 while (!re_string_eoi (&mctx->input))
1110 re_dfastate_t *old_state = cur_state;
1111 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1113 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1114 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1115 && mctx->input.valid_len < mctx->input.len))
1117 err = extend_buffers (mctx);
1118 if (BE (err != REG_NOERROR, 0))
1120 assert (err == REG_ESPACE);
1125 cur_state = transit_state (&err, mctx, cur_state);
1126 if (mctx->state_log != NULL)
1127 cur_state = merge_state_with_log (&err, mctx, cur_state);
1129 if (cur_state == NULL)
1131 /* Reached the invalid state or an error. Try to recover a valid
1132 state using the state log, if available and if we have not
1133 already found a valid (even if not the longest) match. */
1134 if (BE (err != REG_NOERROR, 0))
1137 if (mctx->state_log == NULL
1138 || (match && !fl_longest_match)
1139 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1143 if (BE (at_init_state, 0))
1145 if (old_state == cur_state)
1146 next_start_idx = next_char_idx;
1151 if (cur_state->halt)
1153 /* Reached a halt state.
1154 Check the halt state can satisfy the current context. */
1155 if (!cur_state->has_constraint
1156 || check_halt_state_context (mctx, cur_state,
1157 re_string_cur_idx (&mctx->input)))
1159 /* We found an appropriate halt state. */
1160 match_last = re_string_cur_idx (&mctx->input);
1163 /* We found a match, do not modify match_first below. */
1164 p_match_first = NULL;
1165 if (!fl_longest_match)
1172 *p_match_first += next_start_idx;
1177 /* Check NODE match the current context. */
1181 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1183 re_token_type_t type = dfa->nodes[node].type;
1184 unsigned int constraint = dfa->nodes[node].constraint;
1185 if (type != END_OF_RE)
1189 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1194 /* Check the halt state STATE match the current context.
1195 Return 0 if not match, if the node, STATE has, is a halt node and
1196 match the context, return the node. */
1200 check_halt_state_context (const re_match_context_t *mctx,
1201 const re_dfastate_t *state, Idx idx)
1204 unsigned int context;
1206 assert (state->halt);
1208 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1209 for (i = 0; i < state->nodes.nelem; ++i)
1210 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1211 return state->nodes.elems[i];
1215 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1216 corresponding to the DFA).
1217 Return the destination node, and update EPS_VIA_NODES;
1218 return REG_MISSING in case of errors. */
1222 proceed_next_node (const re_match_context_t *mctx,
1223 Idx nregs, regmatch_t *regs, Idx *pidx, Idx node,
1224 re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1226 re_dfa_t *const dfa = mctx->dfa;
1229 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1231 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1232 re_node_set *edests = &dfa->edests[node];
1234 err = re_node_set_insert (eps_via_nodes, node);
1235 if (BE (err < 0, 0))
1237 /* Pick up a valid destination, or return REG_MISSING if none
1239 for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1241 Idx candidate = edests->elems[i];
1242 if (!re_node_set_contains (cur_nodes, candidate))
1244 if (dest_node == REG_MISSING)
1245 dest_node = candidate;
1249 /* In order to avoid infinite loop like "(a*)*", return the second
1250 epsilon-transition if the first was already considered. */
1251 if (re_node_set_contains (eps_via_nodes, dest_node))
1254 /* Otherwise, push the second epsilon-transition on the fail stack. */
1256 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1260 /* We know we are going to exit. */
1269 re_token_type_t type = dfa->nodes[node].type;
1271 #ifdef RE_ENABLE_I18N
1272 if (dfa->nodes[node].accept_mb)
1273 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1275 #endif /* RE_ENABLE_I18N */
1276 if (type == OP_BACK_REF)
1278 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1279 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1282 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1286 char *buf = (char *) re_string_get_buffer (&mctx->input);
1287 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1296 err = re_node_set_insert (eps_via_nodes, node);
1297 if (BE (err < 0, 0))
1299 dest_node = dfa->edests[node].elems[0];
1300 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1307 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1309 Idx dest_node = dfa->nexts[node];
1310 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1311 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1312 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1315 re_node_set_empty (eps_via_nodes);
1322 static reg_errcode_t
1324 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1325 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1328 Idx num = fs->num++;
1329 if (fs->num == fs->alloc)
1331 struct re_fail_stack_ent_t *new_array =
1332 re_realloc (fs->stack, struct re_fail_stack_ent_t, fs->alloc * 2);
1333 if (new_array == NULL)
1336 fs->stack = new_array;
1338 fs->stack[num].idx = str_idx;
1339 fs->stack[num].node = dest_node;
1340 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1341 if (fs->stack[num].regs == NULL)
1343 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1344 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1350 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx,
1351 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1353 Idx num = --fs->num;
1354 assert (REG_VALID_INDEX (num));
1355 *pidx = fs->stack[num].idx;
1356 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1357 re_node_set_free (eps_via_nodes);
1358 re_free (fs->stack[num].regs);
1359 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1360 return fs->stack[num].node;
1363 /* Set the positions where the subexpressions are starts/ends to registers
1365 Note: We assume that pmatch[0] is already set, and
1366 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1368 static reg_errcode_t
1370 set_regs (const regex_t *preg, const re_match_context_t *mctx,
1371 size_t nmatch, regmatch_t *pmatch, int fl_backtrack)
1373 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1375 re_node_set eps_via_nodes;
1376 struct re_fail_stack_t *fs;
1377 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1378 regmatch_t *prev_idx_match;
1379 int prev_idx_match_malloced = 0;
1382 assert (nmatch > 1);
1383 assert (mctx->state_log != NULL);
1388 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1389 if (fs->stack == NULL)
1395 cur_node = dfa->init_node;
1396 re_node_set_init_empty (&eps_via_nodes);
1398 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1399 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1402 prev_idx_match = re_malloc (regmatch_t, nmatch);
1403 if (prev_idx_match == NULL)
1405 free_fail_stack_return (fs);
1408 prev_idx_match_malloced = 1;
1410 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1412 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1414 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1416 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1421 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1422 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1424 if (reg_idx == nmatch)
1426 re_node_set_free (&eps_via_nodes);
1427 if (prev_idx_match_malloced)
1428 re_free (prev_idx_match);
1429 return free_fail_stack_return (fs);
1431 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1436 re_node_set_free (&eps_via_nodes);
1437 if (prev_idx_match_malloced)
1438 re_free (prev_idx_match);
1443 /* Proceed to next node. */
1444 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1445 &eps_via_nodes, fs);
1447 if (BE (! REG_VALID_INDEX (cur_node), 0))
1449 if (BE (cur_node == REG_ERROR, 0))
1451 re_node_set_free (&eps_via_nodes);
1452 if (prev_idx_match_malloced)
1453 re_free (prev_idx_match);
1454 free_fail_stack_return (fs);
1458 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1462 re_node_set_free (&eps_via_nodes);
1463 if (prev_idx_match_malloced)
1464 re_free (prev_idx_match);
1469 re_node_set_free (&eps_via_nodes);
1470 if (prev_idx_match_malloced)
1471 re_free (prev_idx_match);
1472 return free_fail_stack_return (fs);
1475 static reg_errcode_t
1477 free_fail_stack_return (struct re_fail_stack_t *fs)
1482 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1484 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1485 re_free (fs->stack[fs_idx].regs);
1487 re_free (fs->stack);
1494 update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1495 Idx cur_node, Idx cur_idx, Idx nmatch)
1497 int type = dfa->nodes[cur_node].type;
1498 if (type == OP_OPEN_SUBEXP)
1500 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1502 /* We are at the first node of this sub expression. */
1503 if (reg_num < nmatch)
1505 pmatch[reg_num].rm_so = cur_idx;
1506 pmatch[reg_num].rm_eo = -1;
1509 else if (type == OP_CLOSE_SUBEXP)
1511 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1512 if (reg_num < nmatch)
1514 /* We are at the last node of this sub expression. */
1515 if (pmatch[reg_num].rm_so < cur_idx)
1517 pmatch[reg_num].rm_eo = cur_idx;
1518 /* This is a non-empty match or we are not inside an optional
1519 subexpression. Accept this right away. */
1520 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1524 if (dfa->nodes[cur_node].opt_subexp
1525 && prev_idx_match[reg_num].rm_so != -1)
1526 /* We transited through an empty match for an optional
1527 subexpression, like (a?)*, and this is not the subexp's
1528 first match. Copy back the old content of the registers
1529 so that matches of an inner subexpression are undone as
1530 well, like in ((a?))*. */
1531 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1533 /* We completed a subexpression, but it may be part of
1534 an optional one, so do not update PREV_IDX_MATCH. */
1535 pmatch[reg_num].rm_eo = cur_idx;
1541 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1542 and sift the nodes in each states according to the following rules.
1543 Updated state_log will be wrote to STATE_LOG.
1545 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1546 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1547 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1548 the LAST_NODE, we throw away the node `a'.
1549 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1550 string `s' and transit to `b':
1551 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1553 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1554 thrown away, we throw away the node `a'.
1555 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1556 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1558 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1559 we throw away the node `a'. */
1561 #define STATE_NODE_CONTAINS(state,node) \
1562 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1564 static reg_errcode_t
1566 sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1570 Idx str_idx = sctx->last_str_idx;
1571 re_node_set cur_dest;
1574 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1577 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1578 transit to the last_node and the last_node itself. */
1579 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1580 if (BE (err != REG_NOERROR, 0))
1582 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1583 if (BE (err != REG_NOERROR, 0))
1586 /* Then check each states in the state_log. */
1589 /* Update counters. */
1590 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1591 if (null_cnt > mctx->max_mb_elem_len)
1593 memset (sctx->sifted_states, '\0',
1594 sizeof (re_dfastate_t *) * str_idx);
1595 re_node_set_free (&cur_dest);
1598 re_node_set_empty (&cur_dest);
1601 if (mctx->state_log[str_idx])
1603 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1604 if (BE (err != REG_NOERROR, 0))
1608 /* Add all the nodes which satisfy the following conditions:
1609 - It can epsilon transit to a node in CUR_DEST.
1611 And update state_log. */
1612 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1613 if (BE (err != REG_NOERROR, 0))
1618 re_node_set_free (&cur_dest);
1622 static reg_errcode_t
1624 build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1625 Idx str_idx, re_node_set *cur_dest)
1627 re_dfa_t *const dfa = mctx->dfa;
1628 re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1631 /* Then build the next sifted state.
1632 We build the next sifted state on `cur_dest', and update
1633 `sifted_states[str_idx]' with `cur_dest'.
1635 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1636 `cur_src' points the node_set of the old `state_log[str_idx]'
1637 (with the epsilon nodes pre-filtered out). */
1638 for (i = 0; i < cur_src->nelem; i++)
1640 Idx prev_node = cur_src->elems[i];
1645 re_token_type_t type = dfa->nodes[prev_node].type;
1646 assert (!IS_EPSILON_NODE (type));
1648 #ifdef RE_ENABLE_I18N
1649 /* If the node may accept `multi byte'. */
1650 if (dfa->nodes[prev_node].accept_mb)
1651 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1652 str_idx, sctx->last_str_idx);
1653 #endif /* RE_ENABLE_I18N */
1655 /* We don't check backreferences here.
1656 See update_cur_sifted_state(). */
1658 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1659 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1660 dfa->nexts[prev_node]))
1666 if (sctx->limits.nelem)
1668 Idx to_idx = str_idx + naccepted;
1669 if (check_dst_limits (mctx, &sctx->limits,
1670 dfa->nexts[prev_node], to_idx,
1671 prev_node, str_idx))
1674 ret = re_node_set_insert (cur_dest, prev_node);
1675 if (BE (ret == -1, 0))
1682 /* Helper functions. */
1684 static reg_errcode_t
1686 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1688 Idx top = mctx->state_log_top;
1690 if (next_state_log_idx >= mctx->input.bufs_len
1691 || (next_state_log_idx >= mctx->input.valid_len
1692 && mctx->input.valid_len < mctx->input.len))
1695 err = extend_buffers (mctx);
1696 if (BE (err != REG_NOERROR, 0))
1700 if (top < next_state_log_idx)
1702 memset (mctx->state_log + top + 1, '\0',
1703 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1704 mctx->state_log_top = next_state_log_idx;
1709 static reg_errcode_t
1711 merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1716 for (st_idx = 0; st_idx < num; ++st_idx)
1718 if (dst[st_idx] == NULL)
1719 dst[st_idx] = src[st_idx];
1720 else if (src[st_idx] != NULL)
1722 re_node_set merged_set;
1723 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1724 &src[st_idx]->nodes);
1725 if (BE (err != REG_NOERROR, 0))
1727 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1728 re_node_set_free (&merged_set);
1729 if (BE (err != REG_NOERROR, 0))
1736 static reg_errcode_t
1738 update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1739 Idx str_idx, re_node_set *dest_nodes)
1741 re_dfa_t *const dfa = mctx->dfa;
1743 const re_node_set *candidates;
1744 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1745 : &mctx->state_log[str_idx]->nodes);
1747 if (dest_nodes->nelem == 0)
1748 sctx->sifted_states[str_idx] = NULL;
1753 /* At first, add the nodes which can epsilon transit to a node in
1755 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1756 if (BE (err != REG_NOERROR, 0))
1759 /* Then, check the limitations in the current sift_context. */
1760 if (sctx->limits.nelem)
1762 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1763 mctx->bkref_ents, str_idx);
1764 if (BE (err != REG_NOERROR, 0))
1769 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1770 if (BE (err != REG_NOERROR, 0))
1774 if (candidates && mctx->state_log[str_idx]->has_backref)
1776 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1777 if (BE (err != REG_NOERROR, 0))
1783 static reg_errcode_t
1785 add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1786 const re_node_set *candidates)
1788 reg_errcode_t err = REG_NOERROR;
1791 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1792 if (BE (err != REG_NOERROR, 0))
1795 if (!state->inveclosure.alloc)
1797 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1798 if (BE (err != REG_NOERROR, 0))
1800 for (i = 0; i < dest_nodes->nelem; i++)
1801 re_node_set_merge (&state->inveclosure,
1802 dfa->inveclosures + dest_nodes->elems[i]);
1804 return re_node_set_add_intersect (dest_nodes, candidates,
1805 &state->inveclosure);
1808 static reg_errcode_t
1810 sub_epsilon_src_nodes (re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1811 const re_node_set *candidates)
1815 re_node_set *inv_eclosure = dfa->inveclosures + node;
1816 re_node_set except_nodes;
1817 re_node_set_init_empty (&except_nodes);
1818 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1820 Idx cur_node = inv_eclosure->elems[ecl_idx];
1821 if (cur_node == node)
1823 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1825 Idx edst1 = dfa->edests[cur_node].elems[0];
1826 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1827 ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1828 if ((!re_node_set_contains (inv_eclosure, edst1)
1829 && re_node_set_contains (dest_nodes, edst1))
1830 || (REG_VALID_NONZERO_INDEX (edst2)
1831 && !re_node_set_contains (inv_eclosure, edst2)
1832 && re_node_set_contains (dest_nodes, edst2)))
1834 err = re_node_set_add_intersect (&except_nodes, candidates,
1835 dfa->inveclosures + cur_node);
1836 if (BE (err != REG_NOERROR, 0))
1838 re_node_set_free (&except_nodes);
1844 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1846 Idx cur_node = inv_eclosure->elems[ecl_idx];
1847 if (!re_node_set_contains (&except_nodes, cur_node))
1849 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1850 re_node_set_remove_at (dest_nodes, idx);
1853 re_node_set_free (&except_nodes);
1859 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1860 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1862 re_dfa_t *const dfa = mctx->dfa;
1863 Idx lim_idx, src_pos, dst_pos;
1865 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1866 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1867 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1870 struct re_backref_cache_entry *ent;
1871 ent = mctx->bkref_ents + limits->elems[lim_idx];
1872 subexp_idx = dfa->nodes[ent->node].opr.idx;
1874 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1875 subexp_idx, dst_node, dst_idx,
1877 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1878 subexp_idx, src_node, src_idx,
1882 <src> <dst> ( <subexp> )
1883 ( <subexp> ) <src> <dst>
1884 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1885 if (src_pos == dst_pos)
1886 continue; /* This is unrelated limitation. */
1895 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1896 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1898 re_dfa_t *const dfa = mctx->dfa;
1899 re_node_set *eclosures = dfa->eclosures + from_node;
1902 /* Else, we are on the boundary: examine the nodes on the epsilon
1904 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1906 Idx node = eclosures->elems[node_idx];
1907 switch (dfa->nodes[node].type)
1910 if (bkref_idx != REG_MISSING)
1912 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1918 if (ent->node != node)
1922 < CHAR_BIT * sizeof ent->eps_reachable_subexps_map
1923 && !(ent->eps_reachable_subexps_map & (1u << subexp_idx)))
1926 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1927 OP_CLOSE_SUBEXP cases below. But, if the
1928 destination node is the same node as the source
1929 node, don't recurse because it would cause an
1930 infinite loop: a regex that exhibits this behavior
1932 dst = dfa->edests[node].elems[0];
1933 if (dst == from_node)
1937 else /* if (boundaries & 2) */
1942 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1944 if (cpos == -1 /* && (boundaries & 1) */)
1946 if (cpos == 0 && (boundaries & 2))
1950 < CHAR_BIT * sizeof ent->eps_reachable_subexps_map)
1951 ent->eps_reachable_subexps_map &= ~(1u << subexp_idx);
1953 while (ent++->more);
1957 case OP_OPEN_SUBEXP:
1958 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1962 case OP_CLOSE_SUBEXP:
1963 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1972 return (boundaries & 2) ? 1 : 0;
1977 check_dst_limits_calc_pos (const re_match_context_t *mctx,
1978 Idx limit, Idx subexp_idx,
1979 Idx from_node, Idx str_idx, Idx bkref_idx)
1981 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1984 /* If we are outside the range of the subexpression, return -1 or 1. */
1985 if (str_idx < lim->subexp_from)
1988 if (lim->subexp_to < str_idx)
1991 /* If we are within the subexpression, return 0. */
1992 boundaries = (str_idx == lim->subexp_from);
1993 boundaries |= (str_idx == lim->subexp_to) << 1;
1994 if (boundaries == 0)
1997 /* Else, examine epsilon closure. */
1998 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1999 from_node, bkref_idx);
2002 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2003 which are against limitations from DEST_NODES. */
2005 static reg_errcode_t
2007 check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
2008 const re_node_set *candidates, re_node_set *limits,
2009 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2012 Idx node_idx, lim_idx;
2014 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2017 struct re_backref_cache_entry *ent;
2018 ent = bkref_ents + limits->elems[lim_idx];
2020 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2021 continue; /* This is unrelated limitation. */
2023 subexp_idx = dfa->nodes[ent->node].opr.idx;
2024 if (ent->subexp_to == str_idx)
2026 Idx ops_node = REG_MISSING;
2027 Idx cls_node = REG_MISSING;
2028 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2030 Idx node = dest_nodes->elems[node_idx];
2031 re_token_type_t type = dfa->nodes[node].type;
2032 if (type == OP_OPEN_SUBEXP
2033 && subexp_idx == dfa->nodes[node].opr.idx)
2035 else if (type == OP_CLOSE_SUBEXP
2036 && subexp_idx == dfa->nodes[node].opr.idx)
2040 /* Check the limitation of the open subexpression. */
2041 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2042 if (REG_VALID_INDEX (ops_node))
2044 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2046 if (BE (err != REG_NOERROR, 0))
2050 /* Check the limitation of the close subexpression. */
2051 if (REG_VALID_INDEX (cls_node))
2052 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2054 Idx node = dest_nodes->elems[node_idx];
2055 if (!re_node_set_contains (dfa->inveclosures + node,
2057 && !re_node_set_contains (dfa->eclosures + node,
2060 /* It is against this limitation.
2061 Remove it form the current sifted state. */
2062 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2064 if (BE (err != REG_NOERROR, 0))
2070 else /* (ent->subexp_to != str_idx) */
2072 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2074 Idx node = dest_nodes->elems[node_idx];
2075 re_token_type_t type = dfa->nodes[node].type;
2076 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2078 if (subexp_idx != dfa->nodes[node].opr.idx)
2080 /* It is against this limitation.
2081 Remove it form the current sifted state. */
2082 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2084 if (BE (err != REG_NOERROR, 0))
2093 static reg_errcode_t
2095 sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2096 Idx str_idx, const re_node_set *candidates)
2098 re_dfa_t *const dfa = mctx->dfa;
2101 re_sift_context_t local_sctx;
2102 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2104 if (first_idx == REG_MISSING)
2107 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2109 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2112 re_token_type_t type;
2113 struct re_backref_cache_entry *entry;
2114 node = candidates->elems[node_idx];
2115 type = dfa->nodes[node].type;
2116 /* Avoid infinite loop for the REs like "()\1+". */
2117 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2119 if (type != OP_BACK_REF)
2122 entry = mctx->bkref_ents + first_idx;
2123 enabled_idx = first_idx;
2126 Idx subexp_len, to_idx, dst_node, ret;
2127 re_dfastate_t *cur_state;
2129 if (entry->node != node)
2131 subexp_len = entry->subexp_to - entry->subexp_from;
2132 to_idx = str_idx + subexp_len;
2133 dst_node = (subexp_len ? dfa->nexts[node]
2134 : dfa->edests[node].elems[0]);
2136 if (to_idx > sctx->last_str_idx
2137 || sctx->sifted_states[to_idx] == NULL
2138 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2139 || check_dst_limits (mctx, &sctx->limits, node,
2140 str_idx, dst_node, to_idx))
2143 if (local_sctx.sifted_states == NULL)
2146 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2147 if (BE (err != REG_NOERROR, 0))
2150 local_sctx.last_node = node;
2151 local_sctx.last_str_idx = str_idx;
2152 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2153 if (BE (ret < 0, 0))
2158 cur_state = local_sctx.sifted_states[str_idx];
2159 err = sift_states_backward (mctx, &local_sctx);
2160 if (BE (err != REG_NOERROR, 0))
2162 if (sctx->limited_states != NULL)
2164 err = merge_state_array (dfa, sctx->limited_states,
2165 local_sctx.sifted_states,
2167 if (BE (err != REG_NOERROR, 0))
2170 local_sctx.sifted_states[str_idx] = cur_state;
2171 re_node_set_remove (&local_sctx.limits, enabled_idx);
2173 /* mctx->bkref_ents may have changed, reload the pointer. */
2174 entry = mctx->bkref_ents + enabled_idx;
2176 while (enabled_idx++, entry++->more);
2180 if (local_sctx.sifted_states != NULL)
2182 re_node_set_free (&local_sctx.limits);
2189 #ifdef RE_ENABLE_I18N
2192 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2193 Idx node_idx, Idx str_idx, Idx max_str_idx)
2195 re_dfa_t *const dfa = mctx->dfa;
2197 /* Check the node can accept `multi byte'. */
2198 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2199 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2200 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2201 dfa->nexts[node_idx]))
2202 /* The node can't accept the `multi byte', or the
2203 destination was already thrown away, then the node
2204 could't accept the current input `multi byte'. */
2206 /* Otherwise, it is sure that the node could accept
2207 `naccepted' bytes input. */
2210 #endif /* RE_ENABLE_I18N */
2213 /* Functions for state transition. */
2215 /* Return the next state to which the current state STATE will transit by
2216 accepting the current input byte, and update STATE_LOG if necessary.
2217 If STATE can accept a multibyte char/collating element/back reference
2218 update the destination of STATE_LOG. */
2220 static re_dfastate_t *
2222 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2223 re_dfastate_t *state)
2225 re_dfastate_t **trtable;
2228 #ifdef RE_ENABLE_I18N
2229 /* If the current state can accept multibyte. */
2230 if (BE (state->accept_mb, 0))
2232 *err = transit_state_mb (mctx, state);
2233 if (BE (*err != REG_NOERROR, 0))
2236 #endif /* RE_ENABLE_I18N */
2238 /* Then decide the next state with the single byte. */
2241 /* don't use transition table */
2242 return transit_state_sb (err, mctx, state);
2245 /* Use transition table */
2246 ch = re_string_fetch_byte (&mctx->input);
2249 trtable = state->trtable;
2250 if (BE (trtable != NULL, 1))
2253 trtable = state->word_trtable;
2254 if (BE (trtable != NULL, 1))
2256 unsigned int context;
2258 = re_string_context_at (&mctx->input,
2259 re_string_cur_idx (&mctx->input) - 1,
2261 if (IS_WORD_CONTEXT (context))
2262 return trtable[ch + SBC_MAX];
2267 if (!build_trtable (mctx->dfa, state))
2273 /* Retry, we now have a transition table. */
2277 /* Update the state_log if we need */
2280 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2281 re_dfastate_t *next_state)
2283 re_dfa_t *const dfa = mctx->dfa;
2284 Idx cur_idx = re_string_cur_idx (&mctx->input);
2286 if (cur_idx > mctx->state_log_top)
2288 mctx->state_log[cur_idx] = next_state;
2289 mctx->state_log_top = cur_idx;
2291 else if (mctx->state_log[cur_idx] == 0)
2293 mctx->state_log[cur_idx] = next_state;
2297 re_dfastate_t *pstate;
2298 unsigned int context;
2299 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2300 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2301 the destination of a multibyte char/collating element/
2302 back reference. Then the next state is the union set of
2303 these destinations and the results of the transition table. */
2304 pstate = mctx->state_log[cur_idx];
2305 log_nodes = pstate->entrance_nodes;
2306 if (next_state != NULL)
2308 table_nodes = next_state->entrance_nodes;
2309 *err = re_node_set_init_union (&next_nodes, table_nodes,
2311 if (BE (*err != REG_NOERROR, 0))
2315 next_nodes = *log_nodes;
2316 /* Note: We already add the nodes of the initial state,
2317 then we don't need to add them here. */
2319 context = re_string_context_at (&mctx->input,
2320 re_string_cur_idx (&mctx->input) - 1,
2322 next_state = mctx->state_log[cur_idx]
2323 = re_acquire_state_context (err, dfa, &next_nodes, context);
2324 /* We don't need to check errors here, since the return value of
2325 this function is next_state and ERR is already set. */
2327 if (table_nodes != NULL)
2328 re_node_set_free (&next_nodes);
2331 if (BE (dfa->nbackref, 0) && next_state != NULL)
2333 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2334 later. We must check them here, since the back references in the
2335 next state might use them. */
2336 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2338 if (BE (*err != REG_NOERROR, 0))
2341 /* If the next state has back references. */
2342 if (next_state->has_backref)
2344 *err = transit_state_bkref (mctx, &next_state->nodes);
2345 if (BE (*err != REG_NOERROR, 0))
2347 next_state = mctx->state_log[cur_idx];
2354 /* Skip bytes in the input that correspond to part of a
2355 multi-byte match, then look in the log for a state
2356 from which to restart matching. */
2357 static re_dfastate_t *
2359 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2361 re_dfastate_t *cur_state = NULL;
2364 Idx max = mctx->state_log_top;
2365 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2369 if (++cur_str_idx > max)
2371 re_string_skip_bytes (&mctx->input, 1);
2373 while (mctx->state_log[cur_str_idx] == NULL);
2375 cur_state = merge_state_with_log (err, mctx, NULL);
2377 while (*err == REG_NOERROR && cur_state == NULL);
2381 /* Helper functions for transit_state. */
2383 /* From the node set CUR_NODES, pick up the nodes whose types are
2384 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2385 expression. And register them to use them later for evaluating the
2386 correspoding back references. */
2388 static reg_errcode_t
2390 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2393 re_dfa_t *const dfa = mctx->dfa;
2397 /* TODO: This isn't efficient.
2398 Because there might be more than one nodes whose types are
2399 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2402 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2404 Idx node = cur_nodes->elems[node_idx];
2405 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2406 && dfa->nodes[node].opr.idx < CHAR_BIT * sizeof dfa->used_bkref_map
2407 && dfa->used_bkref_map & (1u << dfa->nodes[node].opr.idx))
2409 err = match_ctx_add_subtop (mctx, node, str_idx);
2410 if (BE (err != REG_NOERROR, 0))
2418 /* Return the next state to which the current state STATE will transit by
2419 accepting the current input byte. */
2421 static re_dfastate_t *
2422 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2423 re_dfastate_t *state)
2425 re_dfa_t *const dfa = mctx->dfa;
2426 re_node_set next_nodes;
2427 re_dfastate_t *next_state;
2428 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2429 unsigned int context;
2431 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2432 if (BE (*err != REG_NOERROR, 0))
2434 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2436 Idx cur_node = state->nodes.elems[node_cnt];
2437 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2439 *err = re_node_set_merge (&next_nodes,
2440 dfa->eclosures + dfa->nexts[cur_node]);
2441 if (BE (*err != REG_NOERROR, 0))
2443 re_node_set_free (&next_nodes);
2448 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2449 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2450 /* We don't need to check errors here, since the return value of
2451 this function is next_state and ERR is already set. */
2453 re_node_set_free (&next_nodes);
2454 re_string_skip_bytes (&mctx->input, 1);
2459 #ifdef RE_ENABLE_I18N
2460 static reg_errcode_t
2462 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2464 re_dfa_t *const dfa = mctx->dfa;
2468 for (i = 0; i < pstate->nodes.nelem; ++i)
2470 re_node_set dest_nodes, *new_nodes;
2471 Idx cur_node_idx = pstate->nodes.elems[i];
2474 unsigned int context;
2475 re_dfastate_t *dest_state;
2477 if (!dfa->nodes[cur_node_idx].accept_mb)
2480 if (dfa->nodes[cur_node_idx].constraint)
2482 context = re_string_context_at (&mctx->input,
2483 re_string_cur_idx (&mctx->input),
2485 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2490 /* How many bytes the node can accept? */
2491 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2492 re_string_cur_idx (&mctx->input));
2496 /* The node can accepts `naccepted' bytes. */
2497 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2498 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2499 : mctx->max_mb_elem_len);
2500 err = clean_state_log_if_needed (mctx, dest_idx);
2501 if (BE (err != REG_NOERROR, 0))
2504 assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2506 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2508 dest_state = mctx->state_log[dest_idx];
2509 if (dest_state == NULL)
2510 dest_nodes = *new_nodes;
2513 err = re_node_set_init_union (&dest_nodes,
2514 dest_state->entrance_nodes, new_nodes);
2515 if (BE (err != REG_NOERROR, 0))
2518 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2519 mctx->state_log[dest_idx]
2520 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2521 if (dest_state != NULL)
2522 re_node_set_free (&dest_nodes);
2523 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2528 #endif /* RE_ENABLE_I18N */
2530 static reg_errcode_t
2532 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2534 re_dfa_t *const dfa = mctx->dfa;
2537 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2539 for (i = 0; i < nodes->nelem; ++i)
2541 Idx dest_str_idx, prev_nelem, bkc_idx;
2542 Idx node_idx = nodes->elems[i];
2543 unsigned int context;
2544 const re_token_t *node = dfa->nodes + node_idx;
2545 re_node_set *new_dest_nodes;
2547 /* Check whether `node' is a backreference or not. */
2548 if (node->type != OP_BACK_REF)
2551 if (node->constraint)
2553 context = re_string_context_at (&mctx->input, cur_str_idx,
2555 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2559 /* `node' is a backreference.
2560 Check the substring which the substring matched. */
2561 bkc_idx = mctx->nbkref_ents;
2562 err = get_subexp (mctx, node_idx, cur_str_idx);
2563 if (BE (err != REG_NOERROR, 0))
2566 /* And add the epsilon closures (which is `new_dest_nodes') of
2567 the backreference to appropriate state_log. */
2569 assert (dfa->nexts[node_idx] != REG_MISSING);
2571 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2574 re_dfastate_t *dest_state;
2575 struct re_backref_cache_entry *bkref_ent;
2576 bkref_ent = mctx->bkref_ents + bkc_idx;
2577 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2579 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2580 new_dest_nodes = (subexp_len == 0
2581 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2582 : dfa->eclosures + dfa->nexts[node_idx]);
2583 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2584 - bkref_ent->subexp_from);
2585 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2587 dest_state = mctx->state_log[dest_str_idx];
2588 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2589 : mctx->state_log[cur_str_idx]->nodes.nelem);
2590 /* Add `new_dest_node' to state_log. */
2591 if (dest_state == NULL)
2593 mctx->state_log[dest_str_idx]
2594 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2596 if (BE (mctx->state_log[dest_str_idx] == NULL
2597 && err != REG_NOERROR, 0))
2602 re_node_set dest_nodes;
2603 err = re_node_set_init_union (&dest_nodes,
2604 dest_state->entrance_nodes,
2606 if (BE (err != REG_NOERROR, 0))
2608 re_node_set_free (&dest_nodes);
2611 mctx->state_log[dest_str_idx]
2612 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2613 re_node_set_free (&dest_nodes);
2614 if (BE (mctx->state_log[dest_str_idx] == NULL
2615 && err != REG_NOERROR, 0))
2618 /* We need to check recursively if the backreference can epsilon
2621 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2623 err = check_subexp_matching_top (mctx, new_dest_nodes,
2625 if (BE (err != REG_NOERROR, 0))
2627 err = transit_state_bkref (mctx, new_dest_nodes);
2628 if (BE (err != REG_NOERROR, 0))
2638 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2639 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2640 Note that we might collect inappropriate candidates here.
2641 However, the cost of checking them strictly here is too high, then we
2642 delay these checking for prune_impossible_nodes(). */
2644 static reg_errcode_t
2646 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2648 re_dfa_t *const dfa = mctx->dfa;
2649 Idx subexp_num, sub_top_idx;
2650 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2651 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2652 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2653 if (cache_idx != REG_MISSING)
2655 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2657 if (entry->node == bkref_node)
2658 return REG_NOERROR; /* We already checked it. */
2659 while (entry++->more);
2662 subexp_num = dfa->nodes[bkref_node].opr.idx;
2664 /* For each sub expression */
2665 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2668 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2669 re_sub_match_last_t *sub_last;
2670 Idx sub_last_idx, sl_str, bkref_str_off;
2672 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2673 continue; /* It isn't related. */
2675 sl_str = sub_top->str_idx;
2676 bkref_str_off = bkref_str_idx;
2677 /* At first, check the last node of sub expressions we already
2679 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2681 regoff_t sl_str_diff;
2682 sub_last = sub_top->lasts[sub_last_idx];
2683 sl_str_diff = sub_last->str_idx - sl_str;
2684 /* The matched string by the sub expression match with the substring
2685 at the back reference? */
2686 if (sl_str_diff > 0)
2688 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2690 /* Not enough chars for a successful match. */
2691 if (bkref_str_off + sl_str_diff > mctx->input.len)
2694 err = clean_state_log_if_needed (mctx,
2697 if (BE (err != REG_NOERROR, 0))
2699 buf = (const char *) re_string_get_buffer (&mctx->input);
2701 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2702 break; /* We don't need to search this sub expression any more. */
2704 bkref_str_off += sl_str_diff;
2705 sl_str += sl_str_diff;
2706 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2709 /* Reload buf, since the preceding call might have reallocated
2711 buf = (const char *) re_string_get_buffer (&mctx->input);
2713 if (err == REG_NOMATCH)
2715 if (BE (err != REG_NOERROR, 0))
2719 if (sub_last_idx < sub_top->nlasts)
2721 if (sub_last_idx > 0)
2723 /* Then, search for the other last nodes of the sub expression. */
2724 for (; sl_str <= bkref_str_idx; ++sl_str)
2727 regoff_t sl_str_off;
2728 const re_node_set *nodes;
2729 sl_str_off = sl_str - sub_top->str_idx;
2730 /* The matched string by the sub expression match with the substring
2731 at the back reference? */
2734 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2736 /* If we are at the end of the input, we cannot match. */
2737 if (bkref_str_off >= mctx->input.len)
2740 err = extend_buffers (mctx);
2741 if (BE (err != REG_NOERROR, 0))
2744 buf = (const char *) re_string_get_buffer (&mctx->input);
2746 if (buf [bkref_str_off++] != buf[sl_str - 1])
2747 break; /* We don't need to search this sub expression
2750 if (mctx->state_log[sl_str] == NULL)
2752 /* Does this state have a ')' of the sub expression? */
2753 nodes = &mctx->state_log[sl_str]->nodes;
2754 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2755 if (cls_node == REG_MISSING)
2757 if (sub_top->path == NULL)
2759 sub_top->path = re_calloc (state_array_t,
2760 sl_str - sub_top->str_idx + 1);
2761 if (sub_top->path == NULL)
2764 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2765 in the current context? */
2766 err = check_arrival (mctx, sub_top->path, sub_top->node,
2767 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2768 if (err == REG_NOMATCH)
2770 if (BE (err != REG_NOERROR, 0))
2772 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2773 if (BE (sub_last == NULL, 0))
2775 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2777 if (err == REG_NOMATCH)
2784 /* Helper functions for get_subexp(). */
2786 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2787 If it can arrive, register the sub expression expressed with SUB_TOP
2790 static reg_errcode_t
2792 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2793 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2797 /* Can the subexpression arrive the back reference? */
2798 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2799 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2800 if (err != REG_NOERROR)
2802 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2804 if (BE (err != REG_NOERROR, 0))
2806 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2807 return clean_state_log_if_needed (mctx, to_idx);
2810 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2811 Search '(' if FL_OPEN, or search ')' otherwise.
2812 TODO: This function isn't efficient...
2813 Because there might be more than one nodes whose types are
2814 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2820 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2821 Idx subexp_idx, int type)
2824 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2826 Idx cls_node = nodes->elems[cls_idx];
2827 const re_token_t *node = dfa->nodes + cls_node;
2828 if (node->type == type
2829 && node->opr.idx == subexp_idx)
2835 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2836 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2838 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2840 static reg_errcode_t
2842 check_arrival (re_match_context_t *mctx, state_array_t *path,
2843 Idx top_node, Idx top_str, Idx last_node, Idx last_str,
2846 re_dfa_t *const dfa = mctx->dfa;
2848 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2849 re_dfastate_t *cur_state = NULL;
2850 re_node_set *cur_nodes, next_nodes;
2851 re_dfastate_t **backup_state_log;
2852 unsigned int context;
2854 subexp_num = dfa->nodes[top_node].opr.idx;
2855 /* Extend the buffer if we need. */
2856 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2858 re_dfastate_t **new_array;
2859 Idx old_alloc = path->alloc;
2860 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2861 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2862 if (new_array == NULL)
2864 path->alloc = old_alloc;
2867 path->array = new_array;
2868 memset (new_array + old_alloc, '\0',
2869 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2872 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2874 /* Temporary modify MCTX. */
2875 backup_state_log = mctx->state_log;
2876 backup_cur_idx = mctx->input.cur_idx;
2877 mctx->state_log = path->array;
2878 mctx->input.cur_idx = str_idx;
2880 /* Setup initial node set. */
2881 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2882 if (str_idx == top_str)
2884 err = re_node_set_init_1 (&next_nodes, top_node);
2885 if (BE (err != REG_NOERROR, 0))
2887 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2888 if (BE (err != REG_NOERROR, 0))
2890 re_node_set_free (&next_nodes);
2896 cur_state = mctx->state_log[str_idx];
2897 if (cur_state && cur_state->has_backref)
2899 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2900 if (BE ( err != REG_NOERROR, 0))
2904 re_node_set_init_empty (&next_nodes);
2906 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2908 if (next_nodes.nelem)
2910 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2912 if (BE ( err != REG_NOERROR, 0))
2914 re_node_set_free (&next_nodes);
2918 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2919 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2921 re_node_set_free (&next_nodes);
2924 mctx->state_log[str_idx] = cur_state;
2927 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2929 re_node_set_empty (&next_nodes);
2930 if (mctx->state_log[str_idx + 1])
2932 err = re_node_set_merge (&next_nodes,
2933 &mctx->state_log[str_idx + 1]->nodes);
2934 if (BE (err != REG_NOERROR, 0))
2936 re_node_set_free (&next_nodes);
2942 err = check_arrival_add_next_nodes (mctx, str_idx,
2943 &cur_state->non_eps_nodes, &next_nodes);
2944 if (BE (err != REG_NOERROR, 0))
2946 re_node_set_free (&next_nodes);
2951 if (next_nodes.nelem)
2953 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2954 if (BE (err != REG_NOERROR, 0))
2956 re_node_set_free (&next_nodes);
2959 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2961 if (BE ( err != REG_NOERROR, 0))
2963 re_node_set_free (&next_nodes);
2967 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2968 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2969 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2971 re_node_set_free (&next_nodes);
2974 mctx->state_log[str_idx] = cur_state;
2975 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2977 re_node_set_free (&next_nodes);
2978 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2979 : &mctx->state_log[last_str]->nodes);
2980 path->next_idx = str_idx;
2983 mctx->state_log = backup_state_log;
2984 mctx->input.cur_idx = backup_cur_idx;
2986 /* Then check the current node set has the node LAST_NODE. */
2987 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2993 /* Helper functions for check_arrival. */
2995 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2997 TODO: This function is similar to the functions transit_state*(),
2998 however this function has many additional works.
2999 Can't we unify them? */
3001 static reg_errcode_t
3003 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3004 re_node_set *cur_nodes,
3005 re_node_set *next_nodes)
3007 re_dfa_t *const dfa = mctx->dfa;
3011 re_node_set union_set;
3012 re_node_set_init_empty (&union_set);
3013 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3016 Idx cur_node = cur_nodes->elems[cur_idx];
3018 re_token_type_t type = dfa->nodes[cur_node].type;
3019 assert (!IS_EPSILON_NODE (type));
3021 #ifdef RE_ENABLE_I18N
3022 /* If the node may accept `multi byte'. */
3023 if (dfa->nodes[cur_node].accept_mb)
3025 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3029 re_dfastate_t *dest_state;
3030 Idx next_node = dfa->nexts[cur_node];
3031 Idx next_idx = str_idx + naccepted;
3032 dest_state = mctx->state_log[next_idx];
3033 re_node_set_empty (&union_set);
3036 err = re_node_set_merge (&union_set, &dest_state->nodes);
3037 if (BE (err != REG_NOERROR, 0))
3039 re_node_set_free (&union_set);
3043 result = re_node_set_insert (&union_set, next_node);
3044 if (BE (result < 0, 0))
3046 re_node_set_free (&union_set);
3049 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3051 if (BE (mctx->state_log[next_idx] == NULL
3052 && err != REG_NOERROR, 0))
3054 re_node_set_free (&union_set);
3059 #endif /* RE_ENABLE_I18N */
3061 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3063 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3064 if (BE (result < 0, 0))
3066 re_node_set_free (&union_set);
3071 re_node_set_free (&union_set);
3075 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3076 CUR_NODES, however exclude the nodes which are:
3077 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3078 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3081 static reg_errcode_t
3083 check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3084 Idx ex_subexp, int type)
3087 Idx idx, outside_node;
3088 re_node_set new_nodes;
3090 assert (cur_nodes->nelem);
3092 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3093 if (BE (err != REG_NOERROR, 0))
3095 /* Create a new node set NEW_NODES with the nodes which are epsilon
3096 closures of the node in CUR_NODES. */
3098 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3100 Idx cur_node = cur_nodes->elems[idx];
3101 re_node_set *eclosure = dfa->eclosures + cur_node;
3102 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3103 if (outside_node == REG_MISSING)
3105 /* There are no problematic nodes, just merge them. */
3106 err = re_node_set_merge (&new_nodes, eclosure);
3107 if (BE (err != REG_NOERROR, 0))
3109 re_node_set_free (&new_nodes);
3115 /* There are problematic nodes, re-calculate incrementally. */
3116 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3118 if (BE (err != REG_NOERROR, 0))
3120 re_node_set_free (&new_nodes);
3125 re_node_set_free (cur_nodes);
3126 *cur_nodes = new_nodes;
3130 /* Helper function for check_arrival_expand_ecl.
3131 Check incrementally the epsilon closure of TARGET, and if it isn't
3132 problematic append it to DST_NODES. */
3134 static reg_errcode_t
3136 check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3137 Idx target, Idx ex_subexp, int type)
3140 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3144 if (dfa->nodes[cur_node].type == type
3145 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3147 if (type == OP_CLOSE_SUBEXP)
3149 err = re_node_set_insert (dst_nodes, cur_node);
3150 if (BE (err == -1, 0))
3155 err = re_node_set_insert (dst_nodes, cur_node);
3156 if (BE (err == -1, 0))
3158 if (dfa->edests[cur_node].nelem == 0)
3160 if (dfa->edests[cur_node].nelem == 2)
3163 check_arrival_expand_ecl_sub (dfa, dst_nodes,
3164 dfa->edests[cur_node].elems[1],
3166 if (BE (ret != REG_NOERROR, 0))
3169 cur_node = dfa->edests[cur_node].elems[0];
3175 /* For all the back references in the current state, calculate the
3176 destination of the back references by the appropriate entry
3177 in MCTX->BKREF_ENTS. */
3179 static reg_errcode_t
3181 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3182 Idx cur_str, Idx subexp_num, int type)
3184 re_dfa_t *const dfa = mctx->dfa;
3186 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3187 struct re_backref_cache_entry *ent;
3189 if (cache_idx_start == REG_MISSING)
3193 ent = mctx->bkref_ents + cache_idx_start;
3196 Idx to_idx, next_node;
3198 /* Is this entry ENT is appropriate? */
3199 if (!re_node_set_contains (cur_nodes, ent->node))
3202 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3203 /* Calculate the destination of the back reference, and append it
3204 to MCTX->STATE_LOG. */
3205 if (to_idx == cur_str)
3207 /* The backreference did epsilon transit, we must re-check all the
3208 node in the current state. */
3209 re_node_set new_dests;
3210 reg_errcode_t err2, err3;
3211 next_node = dfa->edests[ent->node].elems[0];
3212 if (re_node_set_contains (cur_nodes, next_node))
3214 err = re_node_set_init_1 (&new_dests, next_node);
3215 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3216 err3 = re_node_set_merge (cur_nodes, &new_dests);
3217 re_node_set_free (&new_dests);
3218 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3219 || err3 != REG_NOERROR, 0))
3221 err = (err != REG_NOERROR ? err
3222 : (err2 != REG_NOERROR ? err2 : err3));
3225 /* TODO: It is still inefficient... */
3230 re_node_set union_set;
3231 next_node = dfa->nexts[ent->node];
3232 if (mctx->state_log[to_idx])
3235 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3238 err = re_node_set_init_copy (&union_set,
3239 &mctx->state_log[to_idx]->nodes);
3240 ret = re_node_set_insert (&union_set, next_node);
3241 if (BE (err != REG_NOERROR || ret < 0, 0))
3243 re_node_set_free (&union_set);
3244 err = err != REG_NOERROR ? err : REG_ESPACE;
3250 err = re_node_set_init_1 (&union_set, next_node);
3251 if (BE (err != REG_NOERROR, 0))
3254 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3255 re_node_set_free (&union_set);
3256 if (BE (mctx->state_log[to_idx] == NULL
3257 && err != REG_NOERROR, 0))
3261 while (ent++->more);
3265 /* Build transition table for the state.
3266 Return 1 if succeeded, otherwise return NULL. */
3270 build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3274 int ch, need_word_trtable = 0;
3275 unsigned int elem, mask;
3276 int dests_node_malloced = 0, dest_states_malloced = 0;
3277 Idx ndests; /* Number of the destination states from `state'. */
3278 re_dfastate_t **trtable;
3279 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3280 re_node_set follows, *dests_node;
3284 /* We build DFA states which corresponds to the destination nodes
3285 from `state'. `dests_node[i]' represents the nodes which i-th
3286 destination state contains, and `dests_ch[i]' represents the
3287 characters which i-th destination state accepts. */
3288 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3289 dests_node = (re_node_set *)
3290 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3293 dests_node = (re_node_set *)
3294 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3295 if (BE (dests_node == NULL, 0))
3297 dests_node_malloced = 1;
3299 dests_ch = (bitset *) (dests_node + SBC_MAX);
3301 /* Initialize transiton table. */
3302 state->word_trtable = state->trtable = NULL;
3304 /* At first, group all nodes belonging to `state' into several
3306 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3307 if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3309 if (dests_node_malloced)
3311 /* Return 0 in case of an error, 1 otherwise. */
3314 state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3320 err = re_node_set_alloc (&follows, ndests + 1);
3321 if (BE (err != REG_NOERROR, 0))
3324 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3325 + ndests * 3 * sizeof (re_dfastate_t *)))
3326 dest_states = (re_dfastate_t **)
3327 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3330 dest_states = (re_dfastate_t **)
3331 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3332 if (BE (dest_states == NULL, 0))
3335 if (dest_states_malloced)
3337 re_node_set_free (&follows);
3338 for (i = 0; i < ndests; ++i)
3339 re_node_set_free (dests_node + i);
3340 if (dests_node_malloced)
3344 dest_states_malloced = 1;
3346 dest_states_word = dest_states + ndests;
3347 dest_states_nl = dest_states_word + ndests;
3348 bitset_empty (acceptable);
3350 /* Then build the states for all destinations. */
3351 for (i = 0; i < ndests; ++i)
3354 re_node_set_empty (&follows);
3355 /* Merge the follows of this destination states. */
3356 for (j = 0; j < dests_node[i].nelem; ++j)
3358 next_node = dfa->nexts[dests_node[i].elems[j]];
3359 if (next_node != REG_MISSING)
3361 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3362 if (BE (err != REG_NOERROR, 0))
3366 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3367 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3369 /* If the new state has context constraint,
3370 build appropriate states for these contexts. */
3371 if (dest_states[i]->has_constraint)
3373 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3375 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3378 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3379 need_word_trtable = 1;
3381 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3383 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3388 dest_states_word[i] = dest_states[i];
3389 dest_states_nl[i] = dest_states[i];
3391 bitset_merge (acceptable, dests_ch[i]);
3394 if (!BE (need_word_trtable, 0))
3396 /* We don't care about whether the following character is a word
3397 character, or we are in a single-byte character set so we can
3398 discern by looking at the character code: allocate a
3399 256-entry transition table. */
3400 trtable = state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3401 if (BE (trtable == NULL, 0))
3404 /* For all characters ch...: */
3405 for (i = 0; i < BITSET_UINTS; ++i)
3406 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3408 mask <<= 1, elem >>= 1, ++ch)
3409 if (BE (elem & 1, 0))
3411 /* There must be exactly one destination which accepts
3412 character ch. See group_nodes_into_DFAstates. */
3413 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3416 /* j-th destination accepts the word character ch. */
3417 if (dfa->word_char[i] & mask)
3418 trtable[ch] = dest_states_word[j];
3420 trtable[ch] = dest_states[j];
3425 /* We care about whether the following character is a word
3426 character, and we are in a multi-byte character set: discern
3427 by looking at the character code: build two 256-entry
3428 transition tables, one starting at trtable[0] and one
3429 starting at trtable[SBC_MAX]. */
3430 trtable = state->word_trtable = re_calloc (re_dfastate_t *, 2 * SBC_MAX);
3431 if (BE (trtable == NULL, 0))
3434 /* For all characters ch...: */
3435 for (i = 0; i < BITSET_UINTS; ++i)
3436 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3438 mask <<= 1, elem >>= 1, ++ch)
3439 if (BE (elem & 1, 0))
3441 /* There must be exactly one destination which accepts
3442 character ch. See group_nodes_into_DFAstates. */
3443 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3446 /* j-th destination accepts the word character ch. */
3447 trtable[ch] = dest_states[j];
3448 trtable[ch + SBC_MAX] = dest_states_word[j];
3453 if (bitset_contain (acceptable, NEWLINE_CHAR))
3455 /* The current state accepts newline character. */
3456 for (j = 0; j < ndests; ++j)
3457 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3459 /* k-th destination accepts newline character. */
3460 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3461 if (need_word_trtable)
3462 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3463 /* There must be only one destination which accepts
3464 newline. See group_nodes_into_DFAstates. */
3469 if (dest_states_malloced)
3472 re_node_set_free (&follows);
3473 for (i = 0; i < ndests; ++i)
3474 re_node_set_free (dests_node + i);
3476 if (dests_node_malloced)
3482 /* Group all nodes belonging to STATE into several destinations.
3483 Then for all destinations, set the nodes belonging to the destination
3484 to DESTS_NODE[i] and set the characters accepted by the destination
3485 to DEST_CH[i]. This function return the number of destinations. */
3489 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3490 re_node_set *dests_node, bitset *dests_ch)
3495 Idx ndests; /* Number of the destinations from `state'. */
3496 bitset accepts; /* Characters a node can accept. */
3497 const re_node_set *cur_nodes = &state->nodes;
3498 bitset_empty (accepts);
3501 /* For all the nodes belonging to `state', */
3502 for (i = 0; i < cur_nodes->nelem; ++i)
3504 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3505 re_token_type_t type = node->type;
3506 unsigned int constraint = node->constraint;
3508 /* Enumerate all single byte character this node can accept. */
3509 if (type == CHARACTER)
3510 bitset_set (accepts, node->opr.c);
3511 else if (type == SIMPLE_BRACKET)
3513 bitset_merge (accepts, node->opr.sbcset);
3515 else if (type == OP_PERIOD)
3517 #ifdef RE_ENABLE_I18N
3518 if (dfa->mb_cur_max > 1)
3519 bitset_merge (accepts, dfa->sb_char);
3522 bitset_set_all (accepts);
3523 if (!(dfa->syntax & REG_DOT_NEWLINE))
3524 bitset_clear (accepts, '\n');
3525 if (dfa->syntax & REG_DOT_NOT_NULL)
3526 bitset_clear (accepts, '\0');
3528 #ifdef RE_ENABLE_I18N
3529 else if (type == OP_UTF8_PERIOD)
3531 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3532 if (!(dfa->syntax & REG_DOT_NEWLINE))
3533 bitset_clear (accepts, '\n');
3534 if (dfa->syntax & REG_DOT_NOT_NULL)
3535 bitset_clear (accepts, '\0');
3541 /* Check the `accepts' and sift the characters which are not
3542 match it the context. */
3545 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3547 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3548 bitset_empty (accepts);
3549 if (accepts_newline)
3550 bitset_set (accepts, NEWLINE_CHAR);
3554 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3556 bitset_empty (accepts);
3560 if (constraint & NEXT_WORD_CONSTRAINT)
3562 unsigned int any_set = 0;
3563 if (type == CHARACTER && !node->word_char)
3565 bitset_empty (accepts);
3568 #ifdef RE_ENABLE_I18N
3569 if (dfa->mb_cur_max > 1)
3570 for (j = 0; j < BITSET_UINTS; ++j)
3571 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3574 for (j = 0; j < BITSET_UINTS; ++j)
3575 any_set |= (accepts[j] &= dfa->word_char[j]);
3579 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3581 unsigned int any_set = 0;
3582 if (type == CHARACTER && node->word_char)
3584 bitset_empty (accepts);
3587 #ifdef RE_ENABLE_I18N
3588 if (dfa->mb_cur_max > 1)
3589 for (j = 0; j < BITSET_UINTS; ++j)
3590 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3593 for (j = 0; j < BITSET_UINTS; ++j)
3594 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3600 /* Then divide `accepts' into DFA states, or create a new
3601 state. Above, we make sure that accepts is not empty. */
3602 for (j = 0; j < ndests; ++j)
3604 bitset intersec; /* Intersection sets, see below. */
3606 /* Flags, see below. */
3607 int has_intersec, not_subset, not_consumed;
3609 /* Optimization, skip if this state doesn't accept the character. */
3610 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3613 /* Enumerate the intersection set of this state and `accepts'. */
3615 for (k = 0; k < BITSET_UINTS; ++k)
3616 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3617 /* And skip if the intersection set is empty. */
3621 /* Then check if this state is a subset of `accepts'. */
3622 not_subset = not_consumed = 0;
3623 for (k = 0; k < BITSET_UINTS; ++k)
3625 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3626 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3629 /* If this state isn't a subset of `accepts', create a
3630 new group state, which has the `remains'. */
3633 bitset_copy (dests_ch[ndests], remains);
3634 bitset_copy (dests_ch[j], intersec);
3635 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3636 if (BE (err != REG_NOERROR, 0))
3641 /* Put the position in the current group. */
3642 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3643 if (BE (result < 0, 0))
3646 /* If all characters are consumed, go to next node. */
3650 /* Some characters remain, create a new group. */
3653 bitset_copy (dests_ch[ndests], accepts);
3654 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3655 if (BE (err != REG_NOERROR, 0))
3658 bitset_empty (accepts);
3663 for (j = 0; j < ndests; ++j)
3664 re_node_set_free (dests_node + j);
3668 #ifdef RE_ENABLE_I18N
3669 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3670 Return the number of the bytes the node accepts.
3671 STR_IDX is the current index of the input string.
3673 This function handles the nodes which can accept one character, or
3674 one collating element like '.', '[a-z]', opposite to the other nodes
3675 can only accept one byte. */
3679 check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
3680 const re_string_t *input, Idx str_idx)
3682 const re_token_t *node = dfa->nodes + node_idx;
3683 int char_len, elem_len;
3686 if (BE (node->type == OP_UTF8_PERIOD, 0))
3688 unsigned char c = re_string_byte_at (input, str_idx), d;
3689 if (BE (c < 0xc2, 1))
3692 if (str_idx + 2 > input->len)
3695 d = re_string_byte_at (input, str_idx + 1);
3697 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3701 if (c == 0xe0 && d < 0xa0)
3707 if (c == 0xf0 && d < 0x90)
3713 if (c == 0xf8 && d < 0x88)
3719 if (c == 0xfc && d < 0x84)
3725 if (str_idx + char_len > input->len)
3728 for (i = 1; i < char_len; ++i)
3730 d = re_string_byte_at (input, str_idx + i);
3731 if (d < 0x80 || d > 0xbf)
3737 char_len = re_string_char_size_at (input, str_idx);
3738 if (node->type == OP_PERIOD)
3742 /* FIXME: I don't think this if is needed, as both '\n'
3743 and '\0' are char_len == 1. */
3744 /* '.' accepts any one character except the following two cases. */
3745 if ((!(dfa->syntax & REG_DOT_NEWLINE) &&
3746 re_string_byte_at (input, str_idx) == '\n') ||
3747 ((dfa->syntax & REG_DOT_NOT_NULL) &&
3748 re_string_byte_at (input, str_idx) == '\0'))
3753 elem_len = re_string_elem_size_at (input, str_idx);
3754 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3757 if (node->type == COMPLEX_BRACKET)
3759 const re_charset_t *cset = node->opr.mbcset;
3761 const unsigned char *pin
3762 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3767 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3768 ? re_string_wchar_at (input, str_idx) : 0);
3770 /* match with multibyte character? */
3771 for (i = 0; i < cset->nmbchars; ++i)
3772 if (wc == cset->mbchars[i])
3774 match_len = char_len;
3775 goto check_node_accept_bytes_match;
3777 /* match with character_class? */
3778 for (i = 0; i < cset->nchar_classes; ++i)
3780 wctype_t wt = cset->char_classes[i];
3781 if (__iswctype (wc, wt))
3783 match_len = char_len;
3784 goto check_node_accept_bytes_match;
3789 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3792 unsigned int in_collseq = 0;
3793 const int32_t *table, *indirect;
3794 const unsigned char *weights, *extra;
3795 const char *collseqwc;
3797 /* This #include defines a local function! */
3798 # include <locale/weight.h>
3800 /* match with collating_symbol? */
3801 if (cset->ncoll_syms)
3802 extra = (const unsigned char *)
3803 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3804 for (i = 0; i < cset->ncoll_syms; ++i)
3806 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3807 /* Compare the length of input collating element and
3808 the length of current collating element. */
3809 if (*coll_sym != elem_len)
3811 /* Compare each bytes. */
3812 for (j = 0; j < *coll_sym; j++)
3813 if (pin[j] != coll_sym[1 + j])
3817 /* Match if every bytes is equal. */
3819 goto check_node_accept_bytes_match;
3825 if (elem_len <= char_len)
3827 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3828 in_collseq = __collseq_table_lookup (collseqwc, wc);
3831 in_collseq = find_collation_sequence_value (pin, elem_len);
3833 /* match with range expression? */
3834 for (i = 0; i < cset->nranges; ++i)
3835 if (cset->range_starts[i] <= in_collseq
3836 && in_collseq <= cset->range_ends[i])
3838 match_len = elem_len;
3839 goto check_node_accept_bytes_match;
3842 /* match with equivalence_class? */
3843 if (cset->nequiv_classes)
3845 const unsigned char *cp = pin;
3846 table = (const int32_t *)
3847 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3848 weights = (const unsigned char *)
3849 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3850 extra = (const unsigned char *)
3851 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3852 indirect = (const int32_t *)
3853 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3854 idx = findidx (&cp);
3856 for (i = 0; i < cset->nequiv_classes; ++i)
3858 int32_t equiv_class_idx = cset->equiv_classes[i];
3859 size_t weight_len = weights[idx];
3860 if (weight_len == weights[equiv_class_idx])
3863 while (cnt <= weight_len
3864 && (weights[equiv_class_idx + 1 + cnt]
3865 == weights[idx + 1 + cnt]))
3867 if (cnt > weight_len)
3869 match_len = elem_len;
3870 goto check_node_accept_bytes_match;
3879 /* match with range expression? */
3881 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3883 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3886 for (i = 0; i < cset->nranges; ++i)
3888 cmp_buf[0] = cset->range_starts[i];
3889 cmp_buf[4] = cset->range_ends[i];
3890 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3891 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3893 match_len = char_len;
3894 goto check_node_accept_bytes_match;
3898 check_node_accept_bytes_match:
3899 if (!cset->non_match)
3906 return (elem_len > char_len) ? elem_len : char_len;
3914 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3916 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3921 /* No valid character. Match it as a single byte character. */
3922 const unsigned char *collseq = (const unsigned char *)
3923 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3924 return collseq[mbs[0]];
3931 const unsigned char *extra = (const unsigned char *)
3932 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3933 int32_t extrasize = (const unsigned char *)
3934 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3936 for (idx = 0; idx < extrasize;)
3938 int mbs_cnt, found = 0;
3939 int32_t elem_mbs_len;
3940 /* Skip the name of collating element name. */
3941 idx = idx + extra[idx] + 1;
3942 elem_mbs_len = extra[idx++];
3943 if (mbs_len == elem_mbs_len)
3945 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3946 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3948 if (mbs_cnt == elem_mbs_len)
3949 /* Found the entry. */
3952 /* Skip the byte sequence of the collating element. */
3953 idx += elem_mbs_len;
3954 /* Adjust for the alignment. */
3955 idx = (idx + 3) & ~3;
3956 /* Skip the collation sequence value. */
3957 idx += sizeof (uint32_t);
3958 /* Skip the wide char sequence of the collating element. */
3959 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3960 /* If we found the entry, return the sequence value. */
3962 return *(uint32_t *) (extra + idx);
3963 /* Skip the collation sequence value. */
3964 idx += sizeof (uint32_t);
3970 #endif /* RE_ENABLE_I18N */
3972 /* Check whether the node accepts the byte which is IDX-th
3973 byte of the INPUT. */
3977 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3981 ch = re_string_byte_at (&mctx->input, idx);
3985 if (node->opr.c != ch)
3989 case SIMPLE_BRACKET:
3990 if (!bitset_contain (node->opr.sbcset, ch))
3994 #ifdef RE_ENABLE_I18N
3995 case OP_UTF8_PERIOD:
4001 if ((ch == '\n' && !(mctx->dfa->syntax & REG_DOT_NEWLINE))
4002 || (ch == '\0' && (mctx->dfa->syntax & REG_DOT_NOT_NULL)))
4010 if (node->constraint)
4012 /* The node has constraints. Check whether the current context
4013 satisfies the constraints. */
4014 unsigned int context = re_string_context_at (&mctx->input, idx,
4016 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4023 /* Extend the buffers, if the buffers have run out. */
4025 static reg_errcode_t
4027 extend_buffers (re_match_context_t *mctx)
4030 re_string_t *pstr = &mctx->input;
4032 /* Double the lengthes of the buffers. */
4033 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4034 if (BE (ret != REG_NOERROR, 0))
4037 if (mctx->state_log != NULL)
4039 /* And double the length of state_log. */
4040 /* XXX We have no indication of the size of this buffer. If this
4041 allocation fail we have no indication that the state_log array
4042 does not have the right size. */
4043 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4044 pstr->bufs_len + 1);
4045 if (BE (new_array == NULL, 0))
4047 mctx->state_log = new_array;
4050 /* Then reconstruct the buffers. */
4053 #ifdef RE_ENABLE_I18N
4054 if (pstr->mb_cur_max > 1)
4056 ret = build_wcs_upper_buffer (pstr);
4057 if (BE (ret != REG_NOERROR, 0))
4061 #endif /* RE_ENABLE_I18N */
4062 build_upper_buffer (pstr);
4066 #ifdef RE_ENABLE_I18N
4067 if (pstr->mb_cur_max > 1)
4068 build_wcs_buffer (pstr);
4070 #endif /* RE_ENABLE_I18N */
4072 if (pstr->trans != NULL)
4073 re_string_translate_buffer (pstr);
4080 /* Functions for matching context. */
4082 /* Initialize MCTX. */
4084 static reg_errcode_t
4086 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4088 mctx->eflags = eflags;
4089 mctx->match_last = REG_MISSING;
4092 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4093 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4094 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4097 /* Already zero-ed by the caller.
4099 mctx->bkref_ents = NULL;
4100 mctx->nbkref_ents = 0;
4101 mctx->nsub_tops = 0; */
4102 mctx->abkref_ents = n;
4103 mctx->max_mb_elem_len = 1;
4104 mctx->asub_tops = n;
4108 /* Clean the entries which depend on the current input in MCTX.
4109 This function must be invoked when the matcher changes the start index
4110 of the input, or changes the input string. */
4114 match_ctx_clean (re_match_context_t *mctx)
4117 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4120 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4121 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4123 re_sub_match_last_t *last = top->lasts[sl_idx];
4124 re_free (last->path.array);
4127 re_free (top->lasts);
4130 re_free (top->path->array);
4131 re_free (top->path);
4136 mctx->nsub_tops = 0;
4137 mctx->nbkref_ents = 0;
4140 /* Free all the memory associated with MCTX. */
4144 match_ctx_free (re_match_context_t *mctx)
4146 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4147 match_ctx_clean (mctx);
4148 re_free (mctx->sub_tops);
4149 re_free (mctx->bkref_ents);
4152 /* Add a new backreference entry to MCTX.
4153 Note that we assume that caller never call this function with duplicate
4154 entry, and call with STR_IDX which isn't smaller than any existing entry.
4157 static reg_errcode_t
4159 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx,
4162 if (mctx->nbkref_ents >= mctx->abkref_ents)
4164 struct re_backref_cache_entry* new_entry;
4165 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4166 mctx->abkref_ents * 2);
4167 if (BE (new_entry == NULL, 0))
4169 re_free (mctx->bkref_ents);
4172 mctx->bkref_ents = new_entry;
4173 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4174 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4175 mctx->abkref_ents *= 2;
4177 if (mctx->nbkref_ents > 0
4178 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4179 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4181 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4182 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4183 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4184 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4186 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4187 If bit N is clear, means that this entry won't epsilon-transition to
4188 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4189 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4192 A backreference does not epsilon-transition unless it is empty, so set
4193 to all zeros if FROM != TO. */
4194 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4195 = (from == to ? -1 : 0);
4197 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4198 if (mctx->max_mb_elem_len < to - from)
4199 mctx->max_mb_elem_len = to - from;
4203 /* Return the first entry with the same str_idx, or REG_MISSING if none is
4204 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4208 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4210 Idx left, right, mid, last;
4211 last = right = mctx->nbkref_ents;
4212 for (left = 0; left < right;)
4214 mid = (left + right) / 2;
4215 if (mctx->bkref_ents[mid].str_idx < str_idx)
4220 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4226 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4229 static reg_errcode_t
4231 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4234 assert (mctx->sub_tops != NULL);
4235 assert (mctx->asub_tops > 0);
4237 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4239 Idx new_asub_tops = mctx->asub_tops * 2;
4240 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4241 re_sub_match_top_t *,
4243 if (BE (new_array == NULL, 0))
4245 mctx->sub_tops = new_array;
4246 mctx->asub_tops = new_asub_tops;
4248 mctx->sub_tops[mctx->nsub_tops] = re_calloc (re_sub_match_top_t, 1);
4249 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4251 mctx->sub_tops[mctx->nsub_tops]->node = node;
4252 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4256 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4257 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4259 static re_sub_match_last_t *
4261 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4263 re_sub_match_last_t *new_entry;
4264 if (BE (subtop->nlasts == subtop->alasts, 0))
4266 Idx new_alasts = 2 * subtop->alasts + 1;
4267 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4268 re_sub_match_last_t *,
4270 if (BE (new_array == NULL, 0))
4272 subtop->lasts = new_array;
4273 subtop->alasts = new_alasts;
4275 new_entry = re_calloc (re_sub_match_last_t, 1);
4276 if (BE (new_entry != NULL, 1))
4278 subtop->lasts[subtop->nlasts] = new_entry;
4279 new_entry->node = node;
4280 new_entry->str_idx = str_idx;
4288 sift_ctx_init (re_sift_context_t *sctx,
4289 re_dfastate_t **sifted_sts,
4290 re_dfastate_t **limited_sts,
4291 Idx last_node, Idx last_str_idx)
4293 sctx->sifted_states = sifted_sts;
4294 sctx->limited_states = limited_sts;
4295 sctx->last_node = last_node;
4296 sctx->last_str_idx = last_str_idx;
4297 re_node_set_init_empty (&sctx->limits);