1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 int n) internal_function;
22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
23 static void match_ctx_free (re_match_context_t *cache) internal_function;
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
25 int str_idx, int from, int to)
27 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
30 int str_idx) internal_function;
31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
32 int node, int str_idx)
34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35 re_dfastate_t **limited_sts, int last_node,
38 static reg_errcode_t re_search_internal (const regex_t *preg,
39 const char *string, int length,
40 int start, int range, int stop,
41 size_t nmatch, regmatch_t pmatch[],
42 int eflags) internal_function;
43 static int re_search_2_stub (struct re_pattern_buffer *bufp,
44 const char *string1, int length1,
45 const char *string2, int length2,
46 int start, int range, struct re_registers *regs,
47 int stop, int ret_len) internal_function;
48 static int re_search_stub (struct re_pattern_buffer *bufp,
49 const char *string, int length, int start,
50 int range, int stop, struct re_registers *regs,
51 int ret_len) internal_function;
52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
53 int nregs, int regs_allocated) internal_function;
54 static inline re_dfastate_t *acquire_init_state_context
55 (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
56 __attribute ((always_inline)) internal_function;
57 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
59 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
62 static int check_halt_node_context (const re_dfa_t *dfa, int node,
63 unsigned int context) internal_function;
64 static int check_halt_state_context (const re_match_context_t *mctx,
65 const re_dfastate_t *state, int idx)
67 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
68 regmatch_t *prev_idx_match, int cur_node,
69 int cur_idx, int nmatch) internal_function;
70 static int proceed_next_node (const re_match_context_t *mctx,
71 int nregs, regmatch_t *regs,
72 int *pidx, int node, re_node_set *eps_via_nodes,
73 struct re_fail_stack_t *fs) internal_function;
74 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
75 int str_idx, int dest_node, int nregs,
77 re_node_set *eps_via_nodes) internal_function;
78 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
79 regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
80 static reg_errcode_t set_regs (const regex_t *preg,
81 const re_match_context_t *mctx,
82 size_t nmatch, regmatch_t *pmatch,
83 int fl_backtrack) internal_function;
84 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
87 static int sift_states_iter_mb (const re_match_context_t *mctx,
88 re_sift_context_t *sctx,
89 int node_idx, int str_idx, int max_str_idx) internal_function;
90 #endif /* RE_ENABLE_I18N */
91 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
92 re_sift_context_t *sctx) internal_function;
93 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
94 re_sift_context_t *sctx, int str_idx,
95 re_node_set *cur_dest) internal_function;
96 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
97 re_sift_context_t *sctx,
99 re_node_set *dest_nodes) internal_function;
100 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
101 re_node_set *dest_nodes,
102 const re_node_set *candidates) internal_function;
103 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
104 re_node_set *dest_nodes,
105 const re_node_set *and_nodes) internal_function;
106 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
107 int dst_node, int dst_idx, int src_node,
108 int src_idx) internal_function;
109 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
110 int boundaries, int subexp_idx,
111 int from_node, int bkref_idx) internal_function;
112 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
113 int limit, int subexp_idx,
114 int node, int str_idx,
115 int bkref_idx) internal_function;
116 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
117 re_node_set *dest_nodes,
118 const re_node_set *candidates,
120 struct re_backref_cache_entry *bkref_ents,
121 int str_idx) internal_function;
122 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
123 re_sift_context_t *sctx,
124 int str_idx, const re_node_set *candidates) internal_function;
125 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
126 int next_state_log_idx) internal_function;
127 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
128 re_dfastate_t **src, int num) internal_function;
129 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
130 re_match_context_t *mctx) internal_function;
131 static re_dfastate_t *transit_state (reg_errcode_t *err,
132 re_match_context_t *mctx,
133 re_dfastate_t *state) internal_function;
134 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
135 re_match_context_t *mctx,
136 re_dfastate_t *next_state) internal_function;
137 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
138 re_node_set *cur_nodes,
139 int str_idx) internal_function;
141 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
142 re_match_context_t *mctx,
143 re_dfastate_t *pstate) internal_function;
145 #ifdef RE_ENABLE_I18N
146 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
147 re_dfastate_t *pstate) internal_function;
148 #endif /* RE_ENABLE_I18N */
149 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
150 const re_node_set *nodes) internal_function;
151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
152 int bkref_node, int bkref_str_idx) internal_function;
153 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
154 const re_sub_match_top_t *sub_top,
155 re_sub_match_last_t *sub_last,
156 int bkref_node, int bkref_str) internal_function;
157 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
158 int subexp_idx, int type) internal_function;
159 static reg_errcode_t check_arrival (re_match_context_t *mctx,
160 state_array_t *path, int top_node,
161 int top_str, int last_node, int last_str,
162 int type) internal_function;
163 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
165 re_node_set *cur_nodes,
166 re_node_set *next_nodes) internal_function;
167 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
168 re_node_set *cur_nodes,
169 int ex_subexp, int type) internal_function;
170 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
171 re_node_set *dst_nodes,
172 int target, int ex_subexp,
173 int type) internal_function;
174 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
175 re_node_set *cur_nodes, int cur_str,
176 int subexp_num, int type) internal_function;
177 static int build_trtable (re_dfa_t *dfa,
178 re_dfastate_t *state) internal_function;
179 #ifdef RE_ENABLE_I18N
180 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
181 const re_string_t *input, int idx) internal_function;
183 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
184 size_t name_len) internal_function;
186 #endif /* RE_ENABLE_I18N */
187 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
188 const re_dfastate_t *state,
189 re_node_set *states_node,
190 bitset *states_ch) internal_function;
191 static int check_node_accept (const re_match_context_t *mctx,
192 const re_token_t *node, int idx) internal_function;
193 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
195 /* Entry point for POSIX code. */
197 /* regexec searches for a given pattern, specified by PREG, in the
200 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
201 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
202 least NMATCH elements, and we set them to the offsets of the
203 corresponding matched substrings.
205 EFLAGS specifies `execution flags' which affect matching: if
206 REG_NOTBOL is set, then ^ does not match at the beginning of the
207 string; if REG_NOTEOL is set, then $ does not match at the end.
209 We return 0 if we find a match and REG_NOMATCH if not. */
212 regexec (const regex_t *__restrict preg, const char *__restrict string,
213 size_t nmatch, regmatch_t pmatch[], int eflags)
218 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
221 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
224 if (eflags & REG_STARTEND)
226 start = pmatch[0].rm_so;
227 length = pmatch[0].rm_eo;
232 length = strlen (string);
235 __libc_lock_lock (dfa->lock);
237 err = re_search_internal (preg, string, length, start, length - start,
238 length, 0, NULL, eflags);
240 err = re_search_internal (preg, string, length, start, length - start,
241 length, nmatch, pmatch, eflags);
242 __libc_lock_unlock (dfa->lock);
243 return err != REG_NOERROR;
247 # include <shlib-compat.h>
248 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
250 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
251 __typeof__ (__regexec) __compat_regexec;
254 attribute_compat_text_section
255 __compat_regexec (const regex_t *__restrict preg,
256 const char *__restrict string, size_t nmatch,
257 regmatch_t pmatch[], int eflags)
259 return regexec (preg, string, nmatch, pmatch,
260 eflags & (REG_NOTBOL | REG_NOTEOL));
262 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
266 /* Entry points for GNU code. */
268 /* re_match, re_search, re_match_2, re_search_2
270 The former two functions operate on STRING with length LENGTH,
271 while the later two operate on concatenation of STRING1 and STRING2
272 with lengths LENGTH1 and LENGTH2, respectively.
274 re_match() matches the compiled pattern in BUFP against the string,
275 starting at index START.
277 re_search() first tries matching at index START, then it tries to match
278 starting from index START + 1, and so on. The last start position tried
279 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
282 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
283 the first STOP characters of the concatenation of the strings should be
286 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
287 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
288 computed relative to the concatenation, not relative to the individual
291 On success, re_match* functions return the length of the match, re_search*
292 return the position of the start of the match. Return value -1 means no
293 match was found and -2 indicates an internal error. */
296 re_match (struct re_pattern_buffer *bufp, const char *string,
297 int length, int start, struct re_registers *regs)
299 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
302 weak_alias (__re_match, re_match)
306 re_search (struct re_pattern_buffer *bufp, const char *string,
307 int length, int start, int range, struct re_registers *regs)
309 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
312 weak_alias (__re_search, re_search)
316 re_match_2 (struct re_pattern_buffer *bufp,
317 const char *string1, int length1,
318 const char *string2, int length2,
319 int start, struct re_registers *regs, int stop)
321 return re_search_2_stub (bufp, string1, length1, string2, length2,
322 start, 0, regs, stop, 1);
325 weak_alias (__re_match_2, re_match_2)
329 re_search_2 (struct re_pattern_buffer *bufp,
330 const char *string1, int length1,
331 const char *string2, int length2,
332 int start, int range, struct re_registers *regs, int stop)
334 return re_search_2_stub (bufp, string1, length1, string2, length2,
335 start, range, regs, stop, 0);
338 weak_alias (__re_search_2, re_search_2)
343 re_search_2_stub (struct re_pattern_buffer *bufp,
344 const char *string1, int length1,
345 const char *string2, int length2,
346 int start, int range, struct re_registers *regs, int stop,
351 int len = length1 + length2;
354 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
357 /* Concatenate the strings. */
361 char *s = re_malloc (char, len);
363 if (BE (s == NULL, 0))
365 memcpy (s, string1, length1);
366 memcpy (s + length1, string2, length2);
375 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
378 re_free ((char *) str);
382 /* The parameters have the same meaning as those of re_search.
383 Additional parameters:
384 If RET_LEN is nonzero the length of the match is returned (re_match style);
385 otherwise the position of the match is returned. */
389 re_search_stub (struct re_pattern_buffer *bufp,
390 const char *string, int length,
391 int start, int range, int stop, struct re_registers *regs,
394 reg_errcode_t result;
399 re_dfa_t *dfa = (re_dfa_t *)bufp->buffer;
402 /* Check for out-of-range. */
403 if (BE (start < 0 || start > length, 0))
405 if (BE (start + range > length, 0))
406 range = length - start;
407 else if (BE (start + range < 0, 0))
410 __libc_lock_lock (dfa->lock);
412 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
413 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
415 /* Compile fastmap if we haven't yet. */
416 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
417 re_compile_fastmap (bufp);
419 if (BE (bufp->no_sub, 0))
422 /* We need at least 1 register. */
425 else if (BE (bufp->regs_allocated == REGS_FIXED &&
426 regs->num_regs < bufp->re_nsub + 1, 0))
428 nregs = regs->num_regs;
429 if (BE (nregs < 1, 0))
431 /* Nothing can be copied to regs. */
437 nregs = bufp->re_nsub + 1;
438 pmatch = re_malloc (regmatch_t, nregs);
439 if (BE (pmatch == NULL, 0))
445 result = re_search_internal (bufp, string, length, start, range, stop,
446 nregs, pmatch, eflags);
450 /* I hope we needn't fill ther regs with -1's when no match was found. */
451 if (result != REG_NOERROR)
453 else if (regs != NULL)
455 /* If caller wants register contents data back, copy them. */
456 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
457 bufp->regs_allocated);
458 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
462 if (BE (rval == 0, 1))
466 assert (pmatch[0].rm_so == start);
467 rval = pmatch[0].rm_eo - start;
470 rval = pmatch[0].rm_so;
474 __libc_lock_unlock (dfa->lock);
480 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs,
483 int rval = REGS_REALLOCATE;
485 int need_regs = nregs + 1;
486 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
489 /* Have the register data arrays been allocated? */
490 if (regs_allocated == REGS_UNALLOCATED)
491 { /* No. So allocate them with malloc. */
492 regs->start = re_malloc (regoff_t, need_regs);
493 regs->end = re_malloc (regoff_t, need_regs);
494 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
495 return REGS_UNALLOCATED;
496 regs->num_regs = need_regs;
498 else if (regs_allocated == REGS_REALLOCATE)
499 { /* Yes. If we need more elements than were already
500 allocated, reallocate them. If we need fewer, just
502 if (BE (need_regs > regs->num_regs, 0))
504 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
505 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
506 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
507 return REGS_UNALLOCATED;
508 regs->start = new_start;
510 regs->num_regs = need_regs;
515 assert (regs_allocated == REGS_FIXED);
516 /* This function may not be called with REGS_FIXED and nregs too big. */
517 assert (regs->num_regs >= nregs);
522 for (i = 0; i < nregs; ++i)
524 regs->start[i] = pmatch[i].rm_so;
525 regs->end[i] = pmatch[i].rm_eo;
527 for ( ; i < regs->num_regs; ++i)
528 regs->start[i] = regs->end[i] = -1;
533 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
534 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
535 this memory for recording register information. STARTS and ENDS
536 must be allocated using the malloc library routine, and must each
537 be at least NUM_REGS * sizeof (regoff_t) bytes long.
539 If NUM_REGS == 0, then subsequent matches should allocate their own
542 Unless this function is called, the first search or match using
543 PATTERN_BUFFER will allocate its own register data, without
544 freeing the old data. */
547 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
548 unsigned int num_regs, regoff_t *starts, regoff_t *ends)
552 bufp->regs_allocated = REGS_REALLOCATE;
553 regs->num_regs = num_regs;
554 regs->start = starts;
559 bufp->regs_allocated = REGS_UNALLOCATED;
561 regs->start = regs->end = (regoff_t *) 0;
565 weak_alias (__re_set_registers, re_set_registers)
568 /* Entry points compatible with 4.2 BSD regex library. We don't define
569 them unless specifically requested. */
571 #if defined _REGEX_RE_COMP || defined _LIBC
579 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
581 #endif /* _REGEX_RE_COMP */
583 /* Internal entry point. */
585 /* Searches for a compiled pattern PREG in the string STRING, whose
586 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
587 mingings with regexec. START, and RANGE have the same meanings
589 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
590 otherwise return the error code.
591 Note: We assume front end functions already check ranges.
592 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
596 re_search_internal (const regex_t *preg,
597 const char *string, int length,
598 int start, int range, int stop,
599 size_t nmatch, regmatch_t pmatch[],
603 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
604 int left_lim, right_lim, incr;
605 int fl_longest_match, match_first, match_kind, match_last = -1;
608 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
609 re_match_context_t mctx = { .dfa = dfa };
611 re_match_context_t mctx;
613 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
614 && range && !preg->can_be_null) ? preg->fastmap : NULL;
615 unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
617 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
618 memset (&mctx, '\0', sizeof (re_match_context_t));
622 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
623 nmatch -= extra_nmatch;
625 /* Check if the DFA haven't been compiled. */
626 if (BE (preg->used == 0 || dfa->init_state == NULL
627 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
628 || dfa->init_state_begbuf == NULL, 0))
632 /* We assume front-end functions already check them. */
633 assert (start + range >= 0 && start + range <= length);
636 /* If initial states with non-begbuf contexts have no elements,
637 the regex must be anchored. If preg->newline_anchor is set,
638 we'll never use init_state_nl, so do not check it. */
639 if (dfa->init_state->nodes.nelem == 0
640 && dfa->init_state_word->nodes.nelem == 0
641 && (dfa->init_state_nl->nodes.nelem == 0
642 || !preg->newline_anchor))
644 if (start != 0 && start + range != 0)
649 /* We must check the longest matching, if nmatch > 0. */
650 fl_longest_match = (nmatch != 0 || dfa->nbackref);
652 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
653 preg->translate, preg->syntax & RE_ICASE, dfa);
654 if (BE (err != REG_NOERROR, 0))
656 mctx.input.stop = stop;
657 mctx.input.raw_stop = stop;
658 mctx.input.newline_anchor = preg->newline_anchor;
660 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
661 if (BE (err != REG_NOERROR, 0))
664 /* We will log all the DFA states through which the dfa pass,
665 if nmatch > 1, or this dfa has "multibyte node", which is a
666 back-reference or a node which can accept multibyte character or
667 multi character collating element. */
668 if (nmatch > 1 || dfa->has_mb_node)
670 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
671 if (BE (mctx.state_log == NULL, 0))
678 mctx.state_log = NULL;
681 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
682 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
684 /* Check incrementally whether of not the input string match. */
685 incr = (range < 0) ? -1 : 1;
686 left_lim = (range < 0) ? start + range : start;
687 right_lim = (range < 0) ? start : start + range;
688 sb = dfa->mb_cur_max == 1;
691 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
692 | (range >= 0 ? 2 : 0)
693 | (t != NULL ? 1 : 0))
696 for (;; match_first += incr)
699 if (match_first < left_lim || right_lim < match_first)
702 /* Advance as rapidly as possible through the string, until we
703 find a plausible place to start matching. This may be done
704 with varying efficiency, so there are various possibilities:
705 only the most common of them are specialized, in order to
706 save on code size. We use a switch statement for speed. */
714 /* Fastmap with single-byte translation, match forward. */
715 while (BE (match_first < right_lim, 1)
716 && !fastmap[t[(unsigned char) string[match_first]]])
718 goto forward_match_found_start_or_reached_end;
721 /* Fastmap without translation, match forward. */
722 while (BE (match_first < right_lim, 1)
723 && !fastmap[(unsigned char) string[match_first]])
726 forward_match_found_start_or_reached_end:
727 if (BE (match_first == right_lim, 0))
729 ch = match_first >= length
730 ? 0 : (unsigned char) string[match_first];
731 if (!fastmap[t ? t[ch] : ch])
738 /* Fastmap without multi-byte translation, match backwards. */
739 while (match_first >= left_lim)
741 ch = match_first >= length
742 ? 0 : (unsigned char) string[match_first];
743 if (fastmap[t ? t[ch] : ch])
747 if (match_first < left_lim)
752 /* In this case, we can't determine easily the current byte,
753 since it might be a component byte of a multibyte
754 character. Then we use the constructed buffer instead. */
757 /* If MATCH_FIRST is out of the valid range, reconstruct the
759 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
760 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
762 err = re_string_reconstruct (&mctx.input, match_first,
764 if (BE (err != REG_NOERROR, 0))
767 offset = match_first - mctx.input.raw_mbs_idx;
769 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
770 Note that MATCH_FIRST must not be smaller than 0. */
771 ch = (match_first >= length
772 ? 0 : re_string_byte_at (&mctx.input, offset));
776 if (match_first < left_lim || match_first > right_lim)
785 /* Reconstruct the buffers so that the matcher can assume that
786 the matching starts from the beginning of the buffer. */
787 err = re_string_reconstruct (&mctx.input, match_first, eflags);
788 if (BE (err != REG_NOERROR, 0))
791 #ifdef RE_ENABLE_I18N
792 /* Don't consider this char as a possible match start if it part,
793 yet isn't the head, of a multibyte character. */
794 if (!sb && !re_string_first_byte (&mctx.input, 0))
798 /* It seems to be appropriate one, then use the matcher. */
799 /* We assume that the matching starts from 0. */
800 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
801 match_last = check_matching (&mctx, fl_longest_match,
802 range >= 0 ? &match_first : NULL);
803 if (match_last != -1)
805 if (BE (match_last == -2, 0))
812 mctx.match_last = match_last;
813 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
815 re_dfastate_t *pstate = mctx.state_log[match_last];
816 mctx.last_node = check_halt_state_context (&mctx, pstate,
819 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
822 err = prune_impossible_nodes (&mctx);
823 if (err == REG_NOERROR)
825 if (BE (err != REG_NOMATCH, 0))
830 break; /* We found a match. */
834 match_ctx_clean (&mctx);
838 assert (match_last != -1);
839 assert (err == REG_NOERROR);
842 /* Set pmatch[] if we need. */
847 /* Initialize registers. */
848 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
849 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
851 /* Set the points where matching start/end. */
853 pmatch[0].rm_eo = mctx.match_last;
855 if (!preg->no_sub && nmatch > 1)
857 err = set_regs (preg, &mctx, nmatch, pmatch,
858 dfa->has_plural_match && dfa->nbackref > 0);
859 if (BE (err != REG_NOERROR, 0))
863 /* At last, add the offset to the each registers, since we slided
864 the buffers so that we could assume that the matching starts
866 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
867 if (pmatch[reg_idx].rm_so != -1)
869 #ifdef RE_ENABLE_I18N
870 if (BE (mctx.input.offsets_needed != 0, 0))
872 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
873 pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
875 pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
876 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
877 pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
879 pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
882 assert (mctx.input.offsets_needed == 0);
884 pmatch[reg_idx].rm_so += match_first;
885 pmatch[reg_idx].rm_eo += match_first;
887 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
889 pmatch[nmatch + reg_idx].rm_so = -1;
890 pmatch[nmatch + reg_idx].rm_eo = -1;
894 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
895 if (dfa->subexp_map[reg_idx] != reg_idx)
897 pmatch[reg_idx + 1].rm_so
898 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
899 pmatch[reg_idx + 1].rm_eo
900 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
905 re_free (mctx.state_log);
907 match_ctx_free (&mctx);
908 re_string_destruct (&mctx.input);
914 prune_impossible_nodes (re_match_context_t *mctx)
916 re_dfa_t *const dfa = mctx->dfa;
917 int halt_node, match_last;
919 re_dfastate_t **sifted_states;
920 re_dfastate_t **lim_states = NULL;
921 re_sift_context_t sctx;
923 assert (mctx->state_log != NULL);
925 match_last = mctx->match_last;
926 halt_node = mctx->last_node;
927 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
928 if (BE (sifted_states == NULL, 0))
935 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
936 if (BE (lim_states == NULL, 0))
943 memset (lim_states, '\0',
944 sizeof (re_dfastate_t *) * (match_last + 1));
945 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
947 ret = sift_states_backward (mctx, &sctx);
948 re_node_set_free (&sctx.limits);
949 if (BE (ret != REG_NOERROR, 0))
951 if (sifted_states[0] != NULL || lim_states[0] != NULL)
961 } while (mctx->state_log[match_last] == NULL
962 || !mctx->state_log[match_last]->halt);
963 halt_node = check_halt_state_context (mctx,
964 mctx->state_log[match_last],
967 ret = merge_state_array (dfa, sifted_states, lim_states,
969 re_free (lim_states);
971 if (BE (ret != REG_NOERROR, 0))
976 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
977 ret = sift_states_backward (mctx, &sctx);
978 re_node_set_free (&sctx.limits);
979 if (BE (ret != REG_NOERROR, 0))
982 re_free (mctx->state_log);
983 mctx->state_log = sifted_states;
984 sifted_states = NULL;
985 mctx->last_node = halt_node;
986 mctx->match_last = match_last;
989 re_free (sifted_states);
990 re_free (lim_states);
994 /* Acquire an initial state and return it.
995 We must select appropriate initial state depending on the context,
996 since initial states may have constraints like "\<", "^", etc.. */
998 static inline re_dfastate_t *
1000 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1003 re_dfa_t *const dfa = mctx->dfa;
1004 if (dfa->init_state->has_constraint)
1006 unsigned int context;
1007 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1008 if (IS_WORD_CONTEXT (context))
1009 return dfa->init_state_word;
1010 else if (IS_ORDINARY_CONTEXT (context))
1011 return dfa->init_state;
1012 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1013 return dfa->init_state_begbuf;
1014 else if (IS_NEWLINE_CONTEXT (context))
1015 return dfa->init_state_nl;
1016 else if (IS_BEGBUF_CONTEXT (context))
1018 /* It is relatively rare case, then calculate on demand. */
1019 return re_acquire_state_context (err, dfa,
1020 dfa->init_state->entrance_nodes,
1024 /* Must not happen? */
1025 return dfa->init_state;
1028 return dfa->init_state;
1031 /* Check whether the regular expression match input string INPUT or not,
1032 and return the index where the matching end, return -1 if not match,
1033 or return -2 in case of an error.
1034 FL_LONGEST_MATCH means we want the POSIX longest matching.
1035 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1036 next place where we may want to try matching.
1037 Note that the matcher assume that the maching starts from the current
1038 index of the buffer. */
1042 check_matching (re_match_context_t *mctx, int fl_longest_match,
1045 re_dfa_t *const dfa = mctx->dfa;
1048 int match_last = -1;
1049 int cur_str_idx = re_string_cur_idx (&mctx->input);
1050 re_dfastate_t *cur_state;
1051 int at_init_state = p_match_first != NULL;
1052 int next_start_idx = cur_str_idx;
1055 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1056 /* An initial state must not be NULL (invalid). */
1057 if (BE (cur_state == NULL, 0))
1059 assert (err == REG_ESPACE);
1063 if (mctx->state_log != NULL)
1065 mctx->state_log[cur_str_idx] = cur_state;
1067 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1068 later. E.g. Processing back references. */
1069 if (BE (dfa->nbackref, 0))
1072 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1073 if (BE (err != REG_NOERROR, 0))
1076 if (cur_state->has_backref)
1078 err = transit_state_bkref (mctx, &cur_state->nodes);
1079 if (BE (err != REG_NOERROR, 0))
1085 /* If the RE accepts NULL string. */
1086 if (BE (cur_state->halt, 0))
1088 if (!cur_state->has_constraint
1089 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1091 if (!fl_longest_match)
1095 match_last = cur_str_idx;
1101 while (!re_string_eoi (&mctx->input))
1103 re_dfastate_t *old_state = cur_state;
1104 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1106 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1107 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1108 && mctx->input.valid_len < mctx->input.len))
1110 err = extend_buffers (mctx);
1111 if (BE (err != REG_NOERROR, 0))
1113 assert (err == REG_ESPACE);
1118 cur_state = transit_state (&err, mctx, cur_state);
1119 if (mctx->state_log != NULL)
1120 cur_state = merge_state_with_log (&err, mctx, cur_state);
1122 if (cur_state == NULL)
1124 /* Reached the invalid state or an error. Try to recover a valid
1125 state using the state log, if available and if we have not
1126 already found a valid (even if not the longest) match. */
1127 if (BE (err != REG_NOERROR, 0))
1130 if (mctx->state_log == NULL
1131 || (match && !fl_longest_match)
1132 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1136 if (BE (at_init_state, 0))
1138 if (old_state == cur_state)
1139 next_start_idx = next_char_idx;
1144 if (cur_state->halt)
1146 /* Reached a halt state.
1147 Check the halt state can satisfy the current context. */
1148 if (!cur_state->has_constraint
1149 || check_halt_state_context (mctx, cur_state,
1150 re_string_cur_idx (&mctx->input)))
1152 /* We found an appropriate halt state. */
1153 match_last = re_string_cur_idx (&mctx->input);
1156 /* We found a match, do not modify match_first below. */
1157 p_match_first = NULL;
1158 if (!fl_longest_match)
1165 *p_match_first += next_start_idx;
1170 /* Check NODE match the current context. */
1174 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1176 re_token_type_t type = dfa->nodes[node].type;
1177 unsigned int constraint = dfa->nodes[node].constraint;
1178 if (type != END_OF_RE)
1182 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1187 /* Check the halt state STATE match the current context.
1188 Return 0 if not match, if the node, STATE has, is a halt node and
1189 match the context, return the node. */
1193 check_halt_state_context (const re_match_context_t *mctx,
1194 const re_dfastate_t *state, int idx)
1197 unsigned int context;
1199 assert (state->halt);
1201 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1202 for (i = 0; i < state->nodes.nelem; ++i)
1203 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1204 return state->nodes.elems[i];
1208 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1209 corresponding to the DFA).
1210 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1215 proceed_next_node (const re_match_context_t *mctx,
1216 int nregs, regmatch_t *regs, int *pidx, int node,
1217 re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1219 re_dfa_t *const dfa = mctx->dfa;
1221 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1223 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1224 re_node_set *edests = &dfa->edests[node];
1226 err = re_node_set_insert (eps_via_nodes, node);
1227 if (BE (err < 0, 0))
1229 /* Pick up a valid destination, or return -1 if none is found. */
1230 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1232 int candidate = edests->elems[i];
1233 if (!re_node_set_contains (cur_nodes, candidate))
1235 if (dest_node == -1)
1236 dest_node = candidate;
1240 /* In order to avoid infinite loop like "(a*)*", return the second
1241 epsilon-transition if the first was already considered. */
1242 if (re_node_set_contains (eps_via_nodes, dest_node))
1245 /* Otherwise, push the second epsilon-transition on the fail stack. */
1247 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1251 /* We know we are going to exit. */
1260 re_token_type_t type = dfa->nodes[node].type;
1262 #ifdef RE_ENABLE_I18N
1263 if (dfa->nodes[node].accept_mb)
1264 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1266 #endif /* RE_ENABLE_I18N */
1267 if (type == OP_BACK_REF)
1269 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1270 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1273 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1277 char *buf = (char *) re_string_get_buffer (&mctx->input);
1278 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1287 err = re_node_set_insert (eps_via_nodes, node);
1288 if (BE (err < 0, 0))
1290 dest_node = dfa->edests[node].elems[0];
1291 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1298 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1300 int dest_node = dfa->nexts[node];
1301 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1302 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1303 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1306 re_node_set_empty (eps_via_nodes);
1313 static reg_errcode_t
1315 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1316 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1319 int num = fs->num++;
1320 if (fs->num == fs->alloc)
1322 struct re_fail_stack_ent_t *new_array;
1323 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1325 if (new_array == NULL)
1328 fs->stack = new_array;
1330 fs->stack[num].idx = str_idx;
1331 fs->stack[num].node = dest_node;
1332 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1333 if (fs->stack[num].regs == NULL)
1335 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1336 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1342 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx,
1343 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1345 int num = --fs->num;
1347 *pidx = fs->stack[num].idx;
1348 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1349 re_node_set_free (eps_via_nodes);
1350 re_free (fs->stack[num].regs);
1351 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1352 return fs->stack[num].node;
1355 /* Set the positions where the subexpressions are starts/ends to registers
1357 Note: We assume that pmatch[0] is already set, and
1358 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1360 static reg_errcode_t
1362 set_regs (const regex_t *preg, const re_match_context_t *mctx,
1363 size_t nmatch, regmatch_t *pmatch, int fl_backtrack)
1365 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1367 re_node_set eps_via_nodes;
1368 struct re_fail_stack_t *fs;
1369 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1370 regmatch_t *prev_idx_match;
1373 assert (nmatch > 1);
1374 assert (mctx->state_log != NULL);
1379 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1380 if (fs->stack == NULL)
1386 cur_node = dfa->init_node;
1387 re_node_set_init_empty (&eps_via_nodes);
1389 prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * nmatch);
1390 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1392 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1394 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1396 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1401 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1402 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1404 if (reg_idx == nmatch)
1406 re_node_set_free (&eps_via_nodes);
1407 return free_fail_stack_return (fs);
1409 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1414 re_node_set_free (&eps_via_nodes);
1419 /* Proceed to next node. */
1420 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1421 &eps_via_nodes, fs);
1423 if (BE (cur_node < 0, 0))
1425 if (BE (cur_node == -2, 0))
1427 re_node_set_free (&eps_via_nodes);
1428 free_fail_stack_return (fs);
1432 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1436 re_node_set_free (&eps_via_nodes);
1441 re_node_set_free (&eps_via_nodes);
1442 return free_fail_stack_return (fs);
1445 static reg_errcode_t
1447 free_fail_stack_return (struct re_fail_stack_t *fs)
1452 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1454 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1455 re_free (fs->stack[fs_idx].regs);
1457 re_free (fs->stack);
1464 update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1465 int cur_node, int cur_idx, int nmatch)
1467 int type = dfa->nodes[cur_node].type;
1468 if (type == OP_OPEN_SUBEXP)
1470 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1472 /* We are at the first node of this sub expression. */
1473 if (reg_num < nmatch)
1475 pmatch[reg_num].rm_so = cur_idx;
1476 pmatch[reg_num].rm_eo = -1;
1479 else if (type == OP_CLOSE_SUBEXP)
1481 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1482 if (reg_num < nmatch)
1484 /* We are at the last node of this sub expression. */
1485 if (pmatch[reg_num].rm_so < cur_idx)
1487 pmatch[reg_num].rm_eo = cur_idx;
1488 /* This is a non-empty match or we are not inside an optional
1489 subexpression. Accept this right away. */
1490 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1494 if (dfa->nodes[cur_node].opt_subexp
1495 && prev_idx_match[reg_num].rm_so != -1)
1496 /* We transited through an empty match for an optional
1497 subexpression, like (a?)*, and this is not the subexp's
1498 first match. Copy back the old content of the registers
1499 so that matches of an inner subexpression are undone as
1500 well, like in ((a?))*. */
1501 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1503 /* We completed a subexpression, but it may be part of
1504 an optional one, so do not update PREV_IDX_MATCH. */
1505 pmatch[reg_num].rm_eo = cur_idx;
1511 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1512 and sift the nodes in each states according to the following rules.
1513 Updated state_log will be wrote to STATE_LOG.
1515 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1516 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1517 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1518 the LAST_NODE, we throw away the node `a'.
1519 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1520 string `s' and transit to `b':
1521 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1523 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1524 thrown away, we throw away the node `a'.
1525 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1526 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1528 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1529 we throw away the node `a'. */
1531 #define STATE_NODE_CONTAINS(state,node) \
1532 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1534 static reg_errcode_t
1536 sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1540 int str_idx = sctx->last_str_idx;
1541 re_node_set cur_dest;
1544 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1547 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1548 transit to the last_node and the last_node itself. */
1549 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1550 if (BE (err != REG_NOERROR, 0))
1552 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1553 if (BE (err != REG_NOERROR, 0))
1556 /* Then check each states in the state_log. */
1559 /* Update counters. */
1560 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1561 if (null_cnt > mctx->max_mb_elem_len)
1563 memset (sctx->sifted_states, '\0',
1564 sizeof (re_dfastate_t *) * str_idx);
1565 re_node_set_free (&cur_dest);
1568 re_node_set_empty (&cur_dest);
1571 if (mctx->state_log[str_idx])
1573 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1574 if (BE (err != REG_NOERROR, 0))
1578 /* Add all the nodes which satisfy the following conditions:
1579 - It can epsilon transit to a node in CUR_DEST.
1581 And update state_log. */
1582 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1583 if (BE (err != REG_NOERROR, 0))
1588 re_node_set_free (&cur_dest);
1592 static reg_errcode_t
1594 build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1595 int str_idx, re_node_set *cur_dest)
1597 re_dfa_t *const dfa = mctx->dfa;
1598 re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1601 /* Then build the next sifted state.
1602 We build the next sifted state on `cur_dest', and update
1603 `sifted_states[str_idx]' with `cur_dest'.
1605 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1606 `cur_src' points the node_set of the old `state_log[str_idx]'
1607 (with the epsilon nodes pre-filtered out). */
1608 for (i = 0; i < cur_src->nelem; i++)
1610 int prev_node = cur_src->elems[i];
1615 re_token_type_t type = dfa->nodes[prev_node].type;
1616 assert (!IS_EPSILON_NODE (type));
1618 #ifdef RE_ENABLE_I18N
1619 /* If the node may accept `multi byte'. */
1620 if (dfa->nodes[prev_node].accept_mb)
1621 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1622 str_idx, sctx->last_str_idx);
1623 #endif /* RE_ENABLE_I18N */
1625 /* We don't check backreferences here.
1626 See update_cur_sifted_state(). */
1628 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1629 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1630 dfa->nexts[prev_node]))
1636 if (sctx->limits.nelem)
1638 int to_idx = str_idx + naccepted;
1639 if (check_dst_limits (mctx, &sctx->limits,
1640 dfa->nexts[prev_node], to_idx,
1641 prev_node, str_idx))
1644 ret = re_node_set_insert (cur_dest, prev_node);
1645 if (BE (ret == -1, 0))
1652 /* Helper functions. */
1654 static reg_errcode_t
1656 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1658 int top = mctx->state_log_top;
1660 if (next_state_log_idx >= mctx->input.bufs_len
1661 || (next_state_log_idx >= mctx->input.valid_len
1662 && mctx->input.valid_len < mctx->input.len))
1665 err = extend_buffers (mctx);
1666 if (BE (err != REG_NOERROR, 0))
1670 if (top < next_state_log_idx)
1672 memset (mctx->state_log + top + 1, '\0',
1673 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1674 mctx->state_log_top = next_state_log_idx;
1679 static reg_errcode_t
1681 merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1686 for (st_idx = 0; st_idx < num; ++st_idx)
1688 if (dst[st_idx] == NULL)
1689 dst[st_idx] = src[st_idx];
1690 else if (src[st_idx] != NULL)
1692 re_node_set merged_set;
1693 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1694 &src[st_idx]->nodes);
1695 if (BE (err != REG_NOERROR, 0))
1697 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1698 re_node_set_free (&merged_set);
1699 if (BE (err != REG_NOERROR, 0))
1706 static reg_errcode_t
1708 update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1709 int str_idx, re_node_set *dest_nodes)
1711 re_dfa_t *const dfa = mctx->dfa;
1713 const re_node_set *candidates;
1714 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1715 : &mctx->state_log[str_idx]->nodes);
1717 if (dest_nodes->nelem == 0)
1718 sctx->sifted_states[str_idx] = NULL;
1723 /* At first, add the nodes which can epsilon transit to a node in
1725 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1726 if (BE (err != REG_NOERROR, 0))
1729 /* Then, check the limitations in the current sift_context. */
1730 if (sctx->limits.nelem)
1732 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1733 mctx->bkref_ents, str_idx);
1734 if (BE (err != REG_NOERROR, 0))
1739 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1740 if (BE (err != REG_NOERROR, 0))
1744 if (candidates && mctx->state_log[str_idx]->has_backref)
1746 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1747 if (BE (err != REG_NOERROR, 0))
1753 static reg_errcode_t
1755 add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1756 const re_node_set *candidates)
1758 reg_errcode_t err = REG_NOERROR;
1761 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1762 if (BE (err != REG_NOERROR, 0))
1765 if (!state->inveclosure.alloc)
1767 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1768 if (BE (err != REG_NOERROR, 0))
1770 for (i = 0; i < dest_nodes->nelem; i++)
1771 re_node_set_merge (&state->inveclosure,
1772 dfa->inveclosures + dest_nodes->elems[i]);
1774 return re_node_set_add_intersect (dest_nodes, candidates,
1775 &state->inveclosure);
1778 static reg_errcode_t
1780 sub_epsilon_src_nodes (re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1781 const re_node_set *candidates)
1785 re_node_set *inv_eclosure = dfa->inveclosures + node;
1786 re_node_set except_nodes;
1787 re_node_set_init_empty (&except_nodes);
1788 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1790 int cur_node = inv_eclosure->elems[ecl_idx];
1791 if (cur_node == node)
1793 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1795 int edst1 = dfa->edests[cur_node].elems[0];
1796 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1797 ? dfa->edests[cur_node].elems[1] : -1);
1798 if ((!re_node_set_contains (inv_eclosure, edst1)
1799 && re_node_set_contains (dest_nodes, edst1))
1801 && !re_node_set_contains (inv_eclosure, edst2)
1802 && re_node_set_contains (dest_nodes, edst2)))
1804 err = re_node_set_add_intersect (&except_nodes, candidates,
1805 dfa->inveclosures + cur_node);
1806 if (BE (err != REG_NOERROR, 0))
1808 re_node_set_free (&except_nodes);
1814 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1816 int cur_node = inv_eclosure->elems[ecl_idx];
1817 if (!re_node_set_contains (&except_nodes, cur_node))
1819 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1820 re_node_set_remove_at (dest_nodes, idx);
1823 re_node_set_free (&except_nodes);
1829 check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
1830 int dst_node, int dst_idx, int src_node, int src_idx)
1832 re_dfa_t *const dfa = mctx->dfa;
1833 int lim_idx, src_pos, dst_pos;
1835 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1836 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1837 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1840 struct re_backref_cache_entry *ent;
1841 ent = mctx->bkref_ents + limits->elems[lim_idx];
1842 subexp_idx = dfa->nodes[ent->node].opr.idx;
1844 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1845 subexp_idx, dst_node, dst_idx,
1847 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1848 subexp_idx, src_node, src_idx,
1852 <src> <dst> ( <subexp> )
1853 ( <subexp> ) <src> <dst>
1854 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1855 if (src_pos == dst_pos)
1856 continue; /* This is unrelated limitation. */
1865 check_dst_limits_calc_pos_1 (re_match_context_t *mctx, int boundaries,
1866 int subexp_idx, int from_node, int bkref_idx)
1868 re_dfa_t *const dfa = mctx->dfa;
1869 re_node_set *eclosures = dfa->eclosures + from_node;
1872 /* Else, we are on the boundary: examine the nodes on the epsilon
1874 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1876 int node = eclosures->elems[node_idx];
1877 switch (dfa->nodes[node].type)
1880 if (bkref_idx != -1)
1882 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1887 if (ent->node != node)
1890 if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1891 && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1894 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1895 OP_CLOSE_SUBEXP cases below. But, if the
1896 destination node is the same node as the source
1897 node, don't recurse because it would cause an
1898 infinite loop: a regex that exhibits this behavior
1900 dst = dfa->edests[node].elems[0];
1901 if (dst == from_node)
1905 else /* if (boundaries & 2) */
1910 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1912 if (cpos == -1 /* && (boundaries & 1) */)
1914 if (cpos == 0 && (boundaries & 2))
1917 ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1919 while (ent++->more);
1923 case OP_OPEN_SUBEXP:
1924 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1928 case OP_CLOSE_SUBEXP:
1929 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1938 return (boundaries & 2) ? 1 : 0;
1943 check_dst_limits_calc_pos (re_match_context_t *mctx, int limit, int subexp_idx,
1944 int from_node, int str_idx, int bkref_idx)
1946 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1949 /* If we are outside the range of the subexpression, return -1 or 1. */
1950 if (str_idx < lim->subexp_from)
1953 if (lim->subexp_to < str_idx)
1956 /* If we are within the subexpression, return 0. */
1957 boundaries = (str_idx == lim->subexp_from);
1958 boundaries |= (str_idx == lim->subexp_to) << 1;
1959 if (boundaries == 0)
1962 /* Else, examine epsilon closure. */
1963 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1964 from_node, bkref_idx);
1967 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1968 which are against limitations from DEST_NODES. */
1970 static reg_errcode_t
1972 check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
1973 const re_node_set *candidates, re_node_set *limits,
1974 struct re_backref_cache_entry *bkref_ents, int str_idx)
1977 int node_idx, lim_idx;
1979 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1982 struct re_backref_cache_entry *ent;
1983 ent = bkref_ents + limits->elems[lim_idx];
1985 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1986 continue; /* This is unrelated limitation. */
1988 subexp_idx = dfa->nodes[ent->node].opr.idx;
1989 if (ent->subexp_to == str_idx)
1993 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1995 int node = dest_nodes->elems[node_idx];
1996 re_token_type_t type = dfa->nodes[node].type;
1997 if (type == OP_OPEN_SUBEXP
1998 && subexp_idx == dfa->nodes[node].opr.idx)
2000 else if (type == OP_CLOSE_SUBEXP
2001 && subexp_idx == dfa->nodes[node].opr.idx)
2005 /* Check the limitation of the open subexpression. */
2006 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2009 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2011 if (BE (err != REG_NOERROR, 0))
2015 /* Check the limitation of the close subexpression. */
2017 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2019 int node = dest_nodes->elems[node_idx];
2020 if (!re_node_set_contains (dfa->inveclosures + node,
2022 && !re_node_set_contains (dfa->eclosures + node,
2025 /* It is against this limitation.
2026 Remove it form the current sifted state. */
2027 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2029 if (BE (err != REG_NOERROR, 0))
2035 else /* (ent->subexp_to != str_idx) */
2037 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2039 int node = dest_nodes->elems[node_idx];
2040 re_token_type_t type = dfa->nodes[node].type;
2041 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2043 if (subexp_idx != dfa->nodes[node].opr.idx)
2045 /* It is against this limitation.
2046 Remove it form the current sifted state. */
2047 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2049 if (BE (err != REG_NOERROR, 0))
2058 static reg_errcode_t
2060 sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2061 int str_idx, const re_node_set *candidates)
2063 re_dfa_t *const dfa = mctx->dfa;
2066 re_sift_context_t local_sctx;
2067 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2069 if (first_idx == -1)
2072 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2074 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2077 re_token_type_t type;
2078 struct re_backref_cache_entry *entry;
2079 node = candidates->elems[node_idx];
2080 type = dfa->nodes[node].type;
2081 /* Avoid infinite loop for the REs like "()\1+". */
2082 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2084 if (type != OP_BACK_REF)
2087 entry = mctx->bkref_ents + first_idx;
2088 enabled_idx = first_idx;
2091 int subexp_len, to_idx, dst_node;
2092 re_dfastate_t *cur_state;
2094 if (entry->node != node)
2096 subexp_len = entry->subexp_to - entry->subexp_from;
2097 to_idx = str_idx + subexp_len;
2098 dst_node = (subexp_len ? dfa->nexts[node]
2099 : dfa->edests[node].elems[0]);
2101 if (to_idx > sctx->last_str_idx
2102 || sctx->sifted_states[to_idx] == NULL
2103 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2104 || check_dst_limits (mctx, &sctx->limits, node,
2105 str_idx, dst_node, to_idx))
2108 if (local_sctx.sifted_states == NULL)
2111 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2112 if (BE (err != REG_NOERROR, 0))
2115 local_sctx.last_node = node;
2116 local_sctx.last_str_idx = str_idx;
2117 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2118 if (BE (err < 0, 0))
2123 cur_state = local_sctx.sifted_states[str_idx];
2124 err = sift_states_backward (mctx, &local_sctx);
2125 if (BE (err != REG_NOERROR, 0))
2127 if (sctx->limited_states != NULL)
2129 err = merge_state_array (dfa, sctx->limited_states,
2130 local_sctx.sifted_states,
2132 if (BE (err != REG_NOERROR, 0))
2135 local_sctx.sifted_states[str_idx] = cur_state;
2136 re_node_set_remove (&local_sctx.limits, enabled_idx);
2138 /* mctx->bkref_ents may have changed, reload the pointer. */
2139 entry = mctx->bkref_ents + enabled_idx;
2141 while (enabled_idx++, entry++->more);
2145 if (local_sctx.sifted_states != NULL)
2147 re_node_set_free (&local_sctx.limits);
2154 #ifdef RE_ENABLE_I18N
2157 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2158 int node_idx, int str_idx, int max_str_idx)
2160 re_dfa_t *const dfa = mctx->dfa;
2162 /* Check the node can accept `multi byte'. */
2163 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2164 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2165 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2166 dfa->nexts[node_idx]))
2167 /* The node can't accept the `multi byte', or the
2168 destination was already thrown away, then the node
2169 could't accept the current input `multi byte'. */
2171 /* Otherwise, it is sure that the node could accept
2172 `naccepted' bytes input. */
2175 #endif /* RE_ENABLE_I18N */
2178 /* Functions for state transition. */
2180 /* Return the next state to which the current state STATE will transit by
2181 accepting the current input byte, and update STATE_LOG if necessary.
2182 If STATE can accept a multibyte char/collating element/back reference
2183 update the destination of STATE_LOG. */
2185 static re_dfastate_t *
2187 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2188 re_dfastate_t *state)
2190 re_dfastate_t **trtable;
2193 #ifdef RE_ENABLE_I18N
2194 /* If the current state can accept multibyte. */
2195 if (BE (state->accept_mb, 0))
2197 *err = transit_state_mb (mctx, state);
2198 if (BE (*err != REG_NOERROR, 0))
2201 #endif /* RE_ENABLE_I18N */
2203 /* Then decide the next state with the single byte. */
2206 /* don't use transition table */
2207 return transit_state_sb (err, mctx, state);
2210 /* Use transition table */
2211 ch = re_string_fetch_byte (&mctx->input);
2214 trtable = state->trtable;
2215 if (BE (trtable != NULL, 1))
2218 trtable = state->word_trtable;
2219 if (BE (trtable != NULL, 1))
2221 unsigned int context;
2223 = re_string_context_at (&mctx->input,
2224 re_string_cur_idx (&mctx->input) - 1,
2226 if (IS_WORD_CONTEXT (context))
2227 return trtable[ch + SBC_MAX];
2232 if (!build_trtable (mctx->dfa, state))
2238 /* Retry, we now have a transition table. */
2242 /* Update the state_log if we need */
2245 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2246 re_dfastate_t *next_state)
2248 re_dfa_t *const dfa = mctx->dfa;
2249 int cur_idx = re_string_cur_idx (&mctx->input);
2251 if (cur_idx > mctx->state_log_top)
2253 mctx->state_log[cur_idx] = next_state;
2254 mctx->state_log_top = cur_idx;
2256 else if (mctx->state_log[cur_idx] == 0)
2258 mctx->state_log[cur_idx] = next_state;
2262 re_dfastate_t *pstate;
2263 unsigned int context;
2264 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2265 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2266 the destination of a multibyte char/collating element/
2267 back reference. Then the next state is the union set of
2268 these destinations and the results of the transition table. */
2269 pstate = mctx->state_log[cur_idx];
2270 log_nodes = pstate->entrance_nodes;
2271 if (next_state != NULL)
2273 table_nodes = next_state->entrance_nodes;
2274 *err = re_node_set_init_union (&next_nodes, table_nodes,
2276 if (BE (*err != REG_NOERROR, 0))
2280 next_nodes = *log_nodes;
2281 /* Note: We already add the nodes of the initial state,
2282 then we don't need to add them here. */
2284 context = re_string_context_at (&mctx->input,
2285 re_string_cur_idx (&mctx->input) - 1,
2287 next_state = mctx->state_log[cur_idx]
2288 = re_acquire_state_context (err, dfa, &next_nodes, context);
2289 /* We don't need to check errors here, since the return value of
2290 this function is next_state and ERR is already set. */
2292 if (table_nodes != NULL)
2293 re_node_set_free (&next_nodes);
2296 if (BE (dfa->nbackref, 0) && next_state != NULL)
2298 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2299 later. We must check them here, since the back references in the
2300 next state might use them. */
2301 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2303 if (BE (*err != REG_NOERROR, 0))
2306 /* If the next state has back references. */
2307 if (next_state->has_backref)
2309 *err = transit_state_bkref (mctx, &next_state->nodes);
2310 if (BE (*err != REG_NOERROR, 0))
2312 next_state = mctx->state_log[cur_idx];
2319 /* Skip bytes in the input that correspond to part of a
2320 multi-byte match, then look in the log for a state
2321 from which to restart matching. */
2322 static re_dfastate_t *
2324 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2326 re_dfastate_t *cur_state = NULL;
2329 int max = mctx->state_log_top;
2330 int cur_str_idx = re_string_cur_idx (&mctx->input);
2334 if (++cur_str_idx > max)
2336 re_string_skip_bytes (&mctx->input, 1);
2338 while (mctx->state_log[cur_str_idx] == NULL);
2340 cur_state = merge_state_with_log (err, mctx, NULL);
2342 while (err == REG_NOERROR && cur_state == NULL);
2346 /* Helper functions for transit_state. */
2348 /* From the node set CUR_NODES, pick up the nodes whose types are
2349 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2350 expression. And register them to use them later for evaluating the
2351 correspoding back references. */
2353 static reg_errcode_t
2355 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2358 re_dfa_t *const dfa = mctx->dfa;
2362 /* TODO: This isn't efficient.
2363 Because there might be more than one nodes whose types are
2364 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2367 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2369 int node = cur_nodes->elems[node_idx];
2370 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2371 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2372 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2374 err = match_ctx_add_subtop (mctx, node, str_idx);
2375 if (BE (err != REG_NOERROR, 0))
2383 /* Return the next state to which the current state STATE will transit by
2384 accepting the current input byte. */
2386 static re_dfastate_t *
2387 transit_state_sb (err, mctx, state)
2389 re_match_context_t *mctx;
2390 re_dfastate_t *state;
2392 re_dfa_t *const dfa = mctx->dfa;
2393 re_node_set next_nodes;
2394 re_dfastate_t *next_state;
2395 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2396 unsigned int context;
2398 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2399 if (BE (*err != REG_NOERROR, 0))
2401 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2403 int cur_node = state->nodes.elems[node_cnt];
2404 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2406 *err = re_node_set_merge (&next_nodes,
2407 dfa->eclosures + dfa->nexts[cur_node]);
2408 if (BE (*err != REG_NOERROR, 0))
2410 re_node_set_free (&next_nodes);
2415 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2416 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2417 /* We don't need to check errors here, since the return value of
2418 this function is next_state and ERR is already set. */
2420 re_node_set_free (&next_nodes);
2421 re_string_skip_bytes (&mctx->input, 1);
2426 #ifdef RE_ENABLE_I18N
2427 static reg_errcode_t
2429 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2431 re_dfa_t *const dfa = mctx->dfa;
2435 for (i = 0; i < pstate->nodes.nelem; ++i)
2437 re_node_set dest_nodes, *new_nodes;
2438 int cur_node_idx = pstate->nodes.elems[i];
2439 int naccepted, dest_idx;
2440 unsigned int context;
2441 re_dfastate_t *dest_state;
2443 if (!dfa->nodes[cur_node_idx].accept_mb)
2446 if (dfa->nodes[cur_node_idx].constraint)
2448 context = re_string_context_at (&mctx->input,
2449 re_string_cur_idx (&mctx->input),
2451 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2456 /* How many bytes the node can accept? */
2457 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2458 re_string_cur_idx (&mctx->input));
2462 /* The node can accepts `naccepted' bytes. */
2463 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2464 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2465 : mctx->max_mb_elem_len);
2466 err = clean_state_log_if_needed (mctx, dest_idx);
2467 if (BE (err != REG_NOERROR, 0))
2470 assert (dfa->nexts[cur_node_idx] != -1);
2472 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2474 dest_state = mctx->state_log[dest_idx];
2475 if (dest_state == NULL)
2476 dest_nodes = *new_nodes;
2479 err = re_node_set_init_union (&dest_nodes,
2480 dest_state->entrance_nodes, new_nodes);
2481 if (BE (err != REG_NOERROR, 0))
2484 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2485 mctx->state_log[dest_idx]
2486 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2487 if (dest_state != NULL)
2488 re_node_set_free (&dest_nodes);
2489 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2494 #endif /* RE_ENABLE_I18N */
2496 static reg_errcode_t
2498 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2500 re_dfa_t *const dfa = mctx->dfa;
2503 int cur_str_idx = re_string_cur_idx (&mctx->input);
2505 for (i = 0; i < nodes->nelem; ++i)
2507 int dest_str_idx, prev_nelem, bkc_idx;
2508 int node_idx = nodes->elems[i];
2509 unsigned int context;
2510 const re_token_t *node = dfa->nodes + node_idx;
2511 re_node_set *new_dest_nodes;
2513 /* Check whether `node' is a backreference or not. */
2514 if (node->type != OP_BACK_REF)
2517 if (node->constraint)
2519 context = re_string_context_at (&mctx->input, cur_str_idx,
2521 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2525 /* `node' is a backreference.
2526 Check the substring which the substring matched. */
2527 bkc_idx = mctx->nbkref_ents;
2528 err = get_subexp (mctx, node_idx, cur_str_idx);
2529 if (BE (err != REG_NOERROR, 0))
2532 /* And add the epsilon closures (which is `new_dest_nodes') of
2533 the backreference to appropriate state_log. */
2535 assert (dfa->nexts[node_idx] != -1);
2537 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2540 re_dfastate_t *dest_state;
2541 struct re_backref_cache_entry *bkref_ent;
2542 bkref_ent = mctx->bkref_ents + bkc_idx;
2543 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2545 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2546 new_dest_nodes = (subexp_len == 0
2547 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2548 : dfa->eclosures + dfa->nexts[node_idx]);
2549 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2550 - bkref_ent->subexp_from);
2551 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2553 dest_state = mctx->state_log[dest_str_idx];
2554 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2555 : mctx->state_log[cur_str_idx]->nodes.nelem);
2556 /* Add `new_dest_node' to state_log. */
2557 if (dest_state == NULL)
2559 mctx->state_log[dest_str_idx]
2560 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2562 if (BE (mctx->state_log[dest_str_idx] == NULL
2563 && err != REG_NOERROR, 0))
2568 re_node_set dest_nodes;
2569 err = re_node_set_init_union (&dest_nodes,
2570 dest_state->entrance_nodes,
2572 if (BE (err != REG_NOERROR, 0))
2574 re_node_set_free (&dest_nodes);
2577 mctx->state_log[dest_str_idx]
2578 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2579 re_node_set_free (&dest_nodes);
2580 if (BE (mctx->state_log[dest_str_idx] == NULL
2581 && err != REG_NOERROR, 0))
2584 /* We need to check recursively if the backreference can epsilon
2587 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2589 err = check_subexp_matching_top (mctx, new_dest_nodes,
2591 if (BE (err != REG_NOERROR, 0))
2593 err = transit_state_bkref (mctx, new_dest_nodes);
2594 if (BE (err != REG_NOERROR, 0))
2604 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2605 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2606 Note that we might collect inappropriate candidates here.
2607 However, the cost of checking them strictly here is too high, then we
2608 delay these checking for prune_impossible_nodes(). */
2610 static reg_errcode_t
2612 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2614 re_dfa_t *const dfa = mctx->dfa;
2615 int subexp_num, sub_top_idx;
2616 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2617 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2618 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2619 if (cache_idx != -1)
2621 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2623 if (entry->node == bkref_node)
2624 return REG_NOERROR; /* We already checked it. */
2625 while (entry++->more);
2628 subexp_num = dfa->nodes[bkref_node].opr.idx;
2630 /* For each sub expression */
2631 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2634 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2635 re_sub_match_last_t *sub_last;
2636 int sub_last_idx, sl_str, bkref_str_off;
2638 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2639 continue; /* It isn't related. */
2641 sl_str = sub_top->str_idx;
2642 bkref_str_off = bkref_str_idx;
2643 /* At first, check the last node of sub expressions we already
2645 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2648 sub_last = sub_top->lasts[sub_last_idx];
2649 sl_str_diff = sub_last->str_idx - sl_str;
2650 /* The matched string by the sub expression match with the substring
2651 at the back reference? */
2652 if (sl_str_diff > 0)
2654 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2656 /* Not enough chars for a successful match. */
2657 if (bkref_str_off + sl_str_diff > mctx->input.len)
2660 err = clean_state_log_if_needed (mctx,
2663 if (BE (err != REG_NOERROR, 0))
2665 buf = (const char *) re_string_get_buffer (&mctx->input);
2667 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2668 break; /* We don't need to search this sub expression any more. */
2670 bkref_str_off += sl_str_diff;
2671 sl_str += sl_str_diff;
2672 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2675 /* Reload buf, since the preceding call might have reallocated
2677 buf = (const char *) re_string_get_buffer (&mctx->input);
2679 if (err == REG_NOMATCH)
2681 if (BE (err != REG_NOERROR, 0))
2685 if (sub_last_idx < sub_top->nlasts)
2687 if (sub_last_idx > 0)
2689 /* Then, search for the other last nodes of the sub expression. */
2690 for (; sl_str <= bkref_str_idx; ++sl_str)
2692 int cls_node, sl_str_off;
2693 const re_node_set *nodes;
2694 sl_str_off = sl_str - sub_top->str_idx;
2695 /* The matched string by the sub expression match with the substring
2696 at the back reference? */
2699 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2701 /* If we are at the end of the input, we cannot match. */
2702 if (bkref_str_off >= mctx->input.len)
2705 err = extend_buffers (mctx);
2706 if (BE (err != REG_NOERROR, 0))
2709 buf = (const char *) re_string_get_buffer (&mctx->input);
2711 if (buf [bkref_str_off++] != buf[sl_str - 1])
2712 break; /* We don't need to search this sub expression
2715 if (mctx->state_log[sl_str] == NULL)
2717 /* Does this state have a ')' of the sub expression? */
2718 nodes = &mctx->state_log[sl_str]->nodes;
2719 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2722 if (sub_top->path == NULL)
2724 sub_top->path = calloc (sizeof (state_array_t),
2725 sl_str - sub_top->str_idx + 1);
2726 if (sub_top->path == NULL)
2729 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2730 in the current context? */
2731 err = check_arrival (mctx, sub_top->path, sub_top->node,
2732 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2733 if (err == REG_NOMATCH)
2735 if (BE (err != REG_NOERROR, 0))
2737 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2738 if (BE (sub_last == NULL, 0))
2740 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2742 if (err == REG_NOMATCH)
2749 /* Helper functions for get_subexp(). */
2751 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2752 If it can arrive, register the sub expression expressed with SUB_TOP
2755 static reg_errcode_t
2757 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2758 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2762 /* Can the subexpression arrive the back reference? */
2763 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2764 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2765 if (err != REG_NOERROR)
2767 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2769 if (BE (err != REG_NOERROR, 0))
2771 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2772 return clean_state_log_if_needed (mctx, to_idx);
2775 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2776 Search '(' if FL_OPEN, or search ')' otherwise.
2777 TODO: This function isn't efficient...
2778 Because there might be more than one nodes whose types are
2779 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2785 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2786 int subexp_idx, int type)
2789 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2791 int cls_node = nodes->elems[cls_idx];
2792 const re_token_t *node = dfa->nodes + cls_node;
2793 if (node->type == type
2794 && node->opr.idx == subexp_idx)
2800 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2801 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2803 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2805 static reg_errcode_t
2807 check_arrival (re_match_context_t *mctx, state_array_t *path,
2808 int top_node, int top_str, int last_node, int last_str,
2811 re_dfa_t *const dfa = mctx->dfa;
2813 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2814 re_dfastate_t *cur_state = NULL;
2815 re_node_set *cur_nodes, next_nodes;
2816 re_dfastate_t **backup_state_log;
2817 unsigned int context;
2819 subexp_num = dfa->nodes[top_node].opr.idx;
2820 /* Extend the buffer if we need. */
2821 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2823 re_dfastate_t **new_array;
2824 int old_alloc = path->alloc;
2825 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2826 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2827 if (new_array == NULL)
2829 path->alloc = old_alloc;
2832 path->array = new_array;
2833 memset (new_array + old_alloc, '\0',
2834 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2837 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2839 /* Temporary modify MCTX. */
2840 backup_state_log = mctx->state_log;
2841 backup_cur_idx = mctx->input.cur_idx;
2842 mctx->state_log = path->array;
2843 mctx->input.cur_idx = str_idx;
2845 /* Setup initial node set. */
2846 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2847 if (str_idx == top_str)
2849 err = re_node_set_init_1 (&next_nodes, top_node);
2850 if (BE (err != REG_NOERROR, 0))
2852 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2853 if (BE (err != REG_NOERROR, 0))
2855 re_node_set_free (&next_nodes);
2861 cur_state = mctx->state_log[str_idx];
2862 if (cur_state && cur_state->has_backref)
2864 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2865 if (BE ( err != REG_NOERROR, 0))
2869 re_node_set_init_empty (&next_nodes);
2871 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2873 if (next_nodes.nelem)
2875 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2877 if (BE ( err != REG_NOERROR, 0))
2879 re_node_set_free (&next_nodes);
2883 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2884 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2886 re_node_set_free (&next_nodes);
2889 mctx->state_log[str_idx] = cur_state;
2892 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2894 re_node_set_empty (&next_nodes);
2895 if (mctx->state_log[str_idx + 1])
2897 err = re_node_set_merge (&next_nodes,
2898 &mctx->state_log[str_idx + 1]->nodes);
2899 if (BE (err != REG_NOERROR, 0))
2901 re_node_set_free (&next_nodes);
2907 err = check_arrival_add_next_nodes (mctx, str_idx,
2908 &cur_state->non_eps_nodes, &next_nodes);
2909 if (BE (err != REG_NOERROR, 0))
2911 re_node_set_free (&next_nodes);
2916 if (next_nodes.nelem)
2918 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2919 if (BE (err != REG_NOERROR, 0))
2921 re_node_set_free (&next_nodes);
2924 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2926 if (BE ( err != REG_NOERROR, 0))
2928 re_node_set_free (&next_nodes);
2932 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2933 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2934 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2936 re_node_set_free (&next_nodes);
2939 mctx->state_log[str_idx] = cur_state;
2940 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2942 re_node_set_free (&next_nodes);
2943 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2944 : &mctx->state_log[last_str]->nodes);
2945 path->next_idx = str_idx;
2948 mctx->state_log = backup_state_log;
2949 mctx->input.cur_idx = backup_cur_idx;
2951 /* Then check the current node set has the node LAST_NODE. */
2952 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2958 /* Helper functions for check_arrival. */
2960 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2962 TODO: This function is similar to the functions transit_state*(),
2963 however this function has many additional works.
2964 Can't we unify them? */
2966 static reg_errcode_t
2968 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
2969 re_node_set *cur_nodes,
2970 re_node_set *next_nodes)
2972 re_dfa_t *const dfa = mctx->dfa;
2976 re_node_set union_set;
2977 re_node_set_init_empty (&union_set);
2978 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2981 int cur_node = cur_nodes->elems[cur_idx];
2983 re_token_type_t type = dfa->nodes[cur_node].type;
2984 assert (!IS_EPSILON_NODE (type));
2986 #ifdef RE_ENABLE_I18N
2987 /* If the node may accept `multi byte'. */
2988 if (dfa->nodes[cur_node].accept_mb)
2990 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2994 re_dfastate_t *dest_state;
2995 int next_node = dfa->nexts[cur_node];
2996 int next_idx = str_idx + naccepted;
2997 dest_state = mctx->state_log[next_idx];
2998 re_node_set_empty (&union_set);
3001 err = re_node_set_merge (&union_set, &dest_state->nodes);
3002 if (BE (err != REG_NOERROR, 0))
3004 re_node_set_free (&union_set);
3008 result = re_node_set_insert (&union_set, next_node);
3009 if (BE (result < 0, 0))
3011 re_node_set_free (&union_set);
3014 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3016 if (BE (mctx->state_log[next_idx] == NULL
3017 && err != REG_NOERROR, 0))
3019 re_node_set_free (&union_set);
3024 #endif /* RE_ENABLE_I18N */
3026 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3028 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3029 if (BE (result < 0, 0))
3031 re_node_set_free (&union_set);
3036 re_node_set_free (&union_set);
3040 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3041 CUR_NODES, however exclude the nodes which are:
3042 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3043 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3046 static reg_errcode_t
3048 check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3049 int ex_subexp, int type)
3052 int idx, outside_node;
3053 re_node_set new_nodes;
3055 assert (cur_nodes->nelem);
3057 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3058 if (BE (err != REG_NOERROR, 0))
3060 /* Create a new node set NEW_NODES with the nodes which are epsilon
3061 closures of the node in CUR_NODES. */
3063 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3065 int cur_node = cur_nodes->elems[idx];
3066 re_node_set *eclosure = dfa->eclosures + cur_node;
3067 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3068 if (outside_node == -1)
3070 /* There are no problematic nodes, just merge them. */
3071 err = re_node_set_merge (&new_nodes, eclosure);
3072 if (BE (err != REG_NOERROR, 0))
3074 re_node_set_free (&new_nodes);
3080 /* There are problematic nodes, re-calculate incrementally. */
3081 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3083 if (BE (err != REG_NOERROR, 0))
3085 re_node_set_free (&new_nodes);
3090 re_node_set_free (cur_nodes);
3091 *cur_nodes = new_nodes;
3095 /* Helper function for check_arrival_expand_ecl.
3096 Check incrementally the epsilon closure of TARGET, and if it isn't
3097 problematic append it to DST_NODES. */
3099 static reg_errcode_t
3101 check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3102 int target, int ex_subexp, int type)
3105 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3109 if (dfa->nodes[cur_node].type == type
3110 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3112 if (type == OP_CLOSE_SUBEXP)
3114 err = re_node_set_insert (dst_nodes, cur_node);
3115 if (BE (err == -1, 0))
3120 err = re_node_set_insert (dst_nodes, cur_node);
3121 if (BE (err == -1, 0))
3123 if (dfa->edests[cur_node].nelem == 0)
3125 if (dfa->edests[cur_node].nelem == 2)
3127 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3128 dfa->edests[cur_node].elems[1],
3130 if (BE (err != REG_NOERROR, 0))
3133 cur_node = dfa->edests[cur_node].elems[0];
3139 /* For all the back references in the current state, calculate the
3140 destination of the back references by the appropriate entry
3141 in MCTX->BKREF_ENTS. */
3143 static reg_errcode_t
3145 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3146 int cur_str, int subexp_num, int type)
3148 re_dfa_t *const dfa = mctx->dfa;
3150 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3151 struct re_backref_cache_entry *ent;
3153 if (cache_idx_start == -1)
3157 ent = mctx->bkref_ents + cache_idx_start;
3160 int to_idx, next_node;
3162 /* Is this entry ENT is appropriate? */
3163 if (!re_node_set_contains (cur_nodes, ent->node))
3166 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3167 /* Calculate the destination of the back reference, and append it
3168 to MCTX->STATE_LOG. */
3169 if (to_idx == cur_str)
3171 /* The backreference did epsilon transit, we must re-check all the
3172 node in the current state. */
3173 re_node_set new_dests;
3174 reg_errcode_t err2, err3;
3175 next_node = dfa->edests[ent->node].elems[0];
3176 if (re_node_set_contains (cur_nodes, next_node))
3178 err = re_node_set_init_1 (&new_dests, next_node);
3179 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3180 err3 = re_node_set_merge (cur_nodes, &new_dests);
3181 re_node_set_free (&new_dests);
3182 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3183 || err3 != REG_NOERROR, 0))
3185 err = (err != REG_NOERROR ? err
3186 : (err2 != REG_NOERROR ? err2 : err3));
3189 /* TODO: It is still inefficient... */
3194 re_node_set union_set;
3195 next_node = dfa->nexts[ent->node];
3196 if (mctx->state_log[to_idx])
3199 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3202 err = re_node_set_init_copy (&union_set,
3203 &mctx->state_log[to_idx]->nodes);
3204 ret = re_node_set_insert (&union_set, next_node);
3205 if (BE (err != REG_NOERROR || ret < 0, 0))
3207 re_node_set_free (&union_set);
3208 err = err != REG_NOERROR ? err : REG_ESPACE;
3214 err = re_node_set_init_1 (&union_set, next_node);
3215 if (BE (err != REG_NOERROR, 0))
3218 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3219 re_node_set_free (&union_set);
3220 if (BE (mctx->state_log[to_idx] == NULL
3221 && err != REG_NOERROR, 0))
3225 while (ent++->more);
3229 /* Build transition table for the state.
3230 Return 1 if succeeded, otherwise return NULL. */
3234 build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3237 int i, j, ch, need_word_trtable = 0;
3238 unsigned int elem, mask;
3239 int dests_node_malloced = 0, dest_states_malloced = 0;
3240 int ndests; /* Number of the destination states from `state'. */
3241 re_dfastate_t **trtable;
3242 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3243 re_node_set follows, *dests_node;
3247 /* We build DFA states which corresponds to the destination nodes
3248 from `state'. `dests_node[i]' represents the nodes which i-th
3249 destination state contains, and `dests_ch[i]' represents the
3250 characters which i-th destination state accepts. */
3252 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3253 dests_node = (re_node_set *)
3254 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3258 dests_node = (re_node_set *)
3259 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3260 if (BE (dests_node == NULL, 0))
3262 dests_node_malloced = 1;
3264 dests_ch = (bitset *) (dests_node + SBC_MAX);
3266 /* Initialize transiton table. */
3267 state->word_trtable = state->trtable = NULL;
3269 /* At first, group all nodes belonging to `state' into several
3271 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3272 if (BE (ndests <= 0, 0))
3274 if (dests_node_malloced)
3276 /* Return 0 in case of an error, 1 otherwise. */
3279 state->trtable = (re_dfastate_t **)
3280 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3286 err = re_node_set_alloc (&follows, ndests + 1);
3287 if (BE (err != REG_NOERROR, 0))
3291 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3292 + ndests * 3 * sizeof (re_dfastate_t *)))
3293 dest_states = (re_dfastate_t **)
3294 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3298 dest_states = (re_dfastate_t **)
3299 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3300 if (BE (dest_states == NULL, 0))
3303 if (dest_states_malloced)
3305 re_node_set_free (&follows);
3306 for (i = 0; i < ndests; ++i)
3307 re_node_set_free (dests_node + i);
3308 if (dests_node_malloced)
3312 dest_states_malloced = 1;
3314 dest_states_word = dest_states + ndests;
3315 dest_states_nl = dest_states_word + ndests;
3316 bitset_empty (acceptable);
3318 /* Then build the states for all destinations. */
3319 for (i = 0; i < ndests; ++i)
3322 re_node_set_empty (&follows);
3323 /* Merge the follows of this destination states. */
3324 for (j = 0; j < dests_node[i].nelem; ++j)
3326 next_node = dfa->nexts[dests_node[i].elems[j]];
3327 if (next_node != -1)
3329 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3330 if (BE (err != REG_NOERROR, 0))
3334 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3335 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3337 /* If the new state has context constraint,
3338 build appropriate states for these contexts. */
3339 if (dest_states[i]->has_constraint)
3341 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3343 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3346 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3347 need_word_trtable = 1;
3349 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3351 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3356 dest_states_word[i] = dest_states[i];
3357 dest_states_nl[i] = dest_states[i];
3359 bitset_merge (acceptable, dests_ch[i]);
3362 if (!BE (need_word_trtable, 0))
3364 /* We don't care about whether the following character is a word
3365 character, or we are in a single-byte character set so we can
3366 discern by looking at the character code: allocate a
3367 256-entry transition table. */
3368 trtable = state->trtable =
3369 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3370 if (BE (trtable == NULL, 0))
3373 /* For all characters ch...: */
3374 for (i = 0; i < BITSET_UINTS; ++i)
3375 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3377 mask <<= 1, elem >>= 1, ++ch)
3378 if (BE (elem & 1, 0))
3380 /* There must be exactly one destination which accepts
3381 character ch. See group_nodes_into_DFAstates. */
3382 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3385 /* j-th destination accepts the word character ch. */
3386 if (dfa->word_char[i] & mask)
3387 trtable[ch] = dest_states_word[j];
3389 trtable[ch] = dest_states[j];
3394 /* We care about whether the following character is a word
3395 character, and we are in a multi-byte character set: discern
3396 by looking at the character code: build two 256-entry
3397 transition tables, one starting at trtable[0] and one
3398 starting at trtable[SBC_MAX]. */
3399 trtable = state->word_trtable =
3400 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3401 if (BE (trtable == NULL, 0))
3404 /* For all characters ch...: */
3405 for (i = 0; i < BITSET_UINTS; ++i)
3406 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3408 mask <<= 1, elem >>= 1, ++ch)
3409 if (BE (elem & 1, 0))
3411 /* There must be exactly one destination which accepts
3412 character ch. See group_nodes_into_DFAstates. */
3413 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3416 /* j-th destination accepts the word character ch. */
3417 trtable[ch] = dest_states[j];
3418 trtable[ch + SBC_MAX] = dest_states_word[j];
3423 if (bitset_contain (acceptable, NEWLINE_CHAR))
3425 /* The current state accepts newline character. */
3426 for (j = 0; j < ndests; ++j)
3427 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3429 /* k-th destination accepts newline character. */
3430 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3431 if (need_word_trtable)
3432 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3433 /* There must be only one destination which accepts
3434 newline. See group_nodes_into_DFAstates. */
3439 if (dest_states_malloced)
3442 re_node_set_free (&follows);
3443 for (i = 0; i < ndests; ++i)
3444 re_node_set_free (dests_node + i);
3446 if (dests_node_malloced)
3452 /* Group all nodes belonging to STATE into several destinations.
3453 Then for all destinations, set the nodes belonging to the destination
3454 to DESTS_NODE[i] and set the characters accepted by the destination
3455 to DEST_CH[i]. This function return the number of destinations. */
3459 group_nodes_into_DFAstates (re_dfa_t *dfa, const re_dfastate_t *state,
3460 re_node_set *dests_node, bitset *dests_ch)
3465 int ndests; /* Number of the destinations from `state'. */
3466 bitset accepts; /* Characters a node can accept. */
3467 const re_node_set *cur_nodes = &state->nodes;
3468 bitset_empty (accepts);
3471 /* For all the nodes belonging to `state', */
3472 for (i = 0; i < cur_nodes->nelem; ++i)
3474 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3475 re_token_type_t type = node->type;
3476 unsigned int constraint = node->constraint;
3478 /* Enumerate all single byte character this node can accept. */
3479 if (type == CHARACTER)
3480 bitset_set (accepts, node->opr.c);
3481 else if (type == SIMPLE_BRACKET)
3483 bitset_merge (accepts, node->opr.sbcset);
3485 else if (type == OP_PERIOD)
3487 #ifdef RE_ENABLE_I18N
3488 if (dfa->mb_cur_max > 1)
3489 bitset_merge (accepts, dfa->sb_char);
3492 bitset_set_all (accepts);
3493 if (!(dfa->syntax & RE_DOT_NEWLINE))
3494 bitset_clear (accepts, '\n');
3495 if (dfa->syntax & RE_DOT_NOT_NULL)
3496 bitset_clear (accepts, '\0');
3498 #ifdef RE_ENABLE_I18N
3499 else if (type == OP_UTF8_PERIOD)
3501 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3502 if (!(dfa->syntax & RE_DOT_NEWLINE))
3503 bitset_clear (accepts, '\n');
3504 if (dfa->syntax & RE_DOT_NOT_NULL)
3505 bitset_clear (accepts, '\0');
3511 /* Check the `accepts' and sift the characters which are not
3512 match it the context. */
3515 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3517 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3518 bitset_empty (accepts);
3519 if (accepts_newline)
3520 bitset_set (accepts, NEWLINE_CHAR);
3524 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3526 bitset_empty (accepts);
3530 if (constraint & NEXT_WORD_CONSTRAINT)
3532 unsigned int any_set = 0;
3533 if (type == CHARACTER && !node->word_char)
3535 bitset_empty (accepts);
3538 #ifdef RE_ENABLE_I18N
3539 if (dfa->mb_cur_max > 1)
3540 for (j = 0; j < BITSET_UINTS; ++j)
3541 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3544 for (j = 0; j < BITSET_UINTS; ++j)
3545 any_set |= (accepts[j] &= dfa->word_char[j]);
3549 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3551 unsigned int any_set = 0;
3552 if (type == CHARACTER && node->word_char)
3554 bitset_empty (accepts);
3557 #ifdef RE_ENABLE_I18N
3558 if (dfa->mb_cur_max > 1)
3559 for (j = 0; j < BITSET_UINTS; ++j)
3560 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3563 for (j = 0; j < BITSET_UINTS; ++j)
3564 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3570 /* Then divide `accepts' into DFA states, or create a new
3571 state. Above, we make sure that accepts is not empty. */
3572 for (j = 0; j < ndests; ++j)
3574 bitset intersec; /* Intersection sets, see below. */
3576 /* Flags, see below. */
3577 int has_intersec, not_subset, not_consumed;
3579 /* Optimization, skip if this state doesn't accept the character. */
3580 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3583 /* Enumerate the intersection set of this state and `accepts'. */
3585 for (k = 0; k < BITSET_UINTS; ++k)
3586 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3587 /* And skip if the intersection set is empty. */
3591 /* Then check if this state is a subset of `accepts'. */
3592 not_subset = not_consumed = 0;
3593 for (k = 0; k < BITSET_UINTS; ++k)
3595 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3596 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3599 /* If this state isn't a subset of `accepts', create a
3600 new group state, which has the `remains'. */
3603 bitset_copy (dests_ch[ndests], remains);
3604 bitset_copy (dests_ch[j], intersec);
3605 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3606 if (BE (err != REG_NOERROR, 0))
3611 /* Put the position in the current group. */
3612 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3613 if (BE (result < 0, 0))
3616 /* If all characters are consumed, go to next node. */
3620 /* Some characters remain, create a new group. */
3623 bitset_copy (dests_ch[ndests], accepts);
3624 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3625 if (BE (err != REG_NOERROR, 0))
3628 bitset_empty (accepts);
3633 for (j = 0; j < ndests; ++j)
3634 re_node_set_free (dests_node + j);
3638 #ifdef RE_ENABLE_I18N
3639 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3640 Return the number of the bytes the node accepts.
3641 STR_IDX is the current index of the input string.
3643 This function handles the nodes which can accept one character, or
3644 one collating element like '.', '[a-z]', opposite to the other nodes
3645 can only accept one byte. */
3649 check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
3650 const re_string_t *input, int str_idx)
3652 const re_token_t *node = dfa->nodes + node_idx;
3653 int char_len, elem_len;
3656 if (BE (node->type == OP_UTF8_PERIOD, 0))
3658 unsigned char c = re_string_byte_at (input, str_idx), d;
3659 if (BE (c < 0xc2, 1))
3662 if (str_idx + 2 > input->len)
3665 d = re_string_byte_at (input, str_idx + 1);
3667 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3671 if (c == 0xe0 && d < 0xa0)
3677 if (c == 0xf0 && d < 0x90)
3683 if (c == 0xf8 && d < 0x88)
3689 if (c == 0xfc && d < 0x84)
3695 if (str_idx + char_len > input->len)
3698 for (i = 1; i < char_len; ++i)
3700 d = re_string_byte_at (input, str_idx + i);
3701 if (d < 0x80 || d > 0xbf)
3707 char_len = re_string_char_size_at (input, str_idx);
3708 if (node->type == OP_PERIOD)
3712 /* FIXME: I don't think this if is needed, as both '\n'
3713 and '\0' are char_len == 1. */
3714 /* '.' accepts any one character except the following two cases. */
3715 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3716 re_string_byte_at (input, str_idx) == '\n') ||
3717 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3718 re_string_byte_at (input, str_idx) == '\0'))
3723 elem_len = re_string_elem_size_at (input, str_idx);
3724 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3727 if (node->type == COMPLEX_BRACKET)
3729 const re_charset_t *cset = node->opr.mbcset;
3731 const unsigned char *pin
3732 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3737 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3738 ? re_string_wchar_at (input, str_idx) : 0);
3740 /* match with multibyte character? */
3741 for (i = 0; i < cset->nmbchars; ++i)
3742 if (wc == cset->mbchars[i])
3744 match_len = char_len;
3745 goto check_node_accept_bytes_match;
3747 /* match with character_class? */
3748 for (i = 0; i < cset->nchar_classes; ++i)
3750 wctype_t wt = cset->char_classes[i];
3751 if (__iswctype (wc, wt))
3753 match_len = char_len;
3754 goto check_node_accept_bytes_match;
3759 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3762 unsigned int in_collseq = 0;
3763 const int32_t *table, *indirect;
3764 const unsigned char *weights, *extra;
3765 const char *collseqwc;
3767 /* This #include defines a local function! */
3768 # include <locale/weight.h>
3770 /* match with collating_symbol? */
3771 if (cset->ncoll_syms)
3772 extra = (const unsigned char *)
3773 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3774 for (i = 0; i < cset->ncoll_syms; ++i)
3776 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3777 /* Compare the length of input collating element and
3778 the length of current collating element. */
3779 if (*coll_sym != elem_len)
3781 /* Compare each bytes. */
3782 for (j = 0; j < *coll_sym; j++)
3783 if (pin[j] != coll_sym[1 + j])
3787 /* Match if every bytes is equal. */
3789 goto check_node_accept_bytes_match;
3795 if (elem_len <= char_len)
3797 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3798 in_collseq = __collseq_table_lookup (collseqwc, wc);
3801 in_collseq = find_collation_sequence_value (pin, elem_len);
3803 /* match with range expression? */
3804 for (i = 0; i < cset->nranges; ++i)
3805 if (cset->range_starts[i] <= in_collseq
3806 && in_collseq <= cset->range_ends[i])
3808 match_len = elem_len;
3809 goto check_node_accept_bytes_match;
3812 /* match with equivalence_class? */
3813 if (cset->nequiv_classes)
3815 const unsigned char *cp = pin;
3816 table = (const int32_t *)
3817 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3818 weights = (const unsigned char *)
3819 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3820 extra = (const unsigned char *)
3821 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3822 indirect = (const int32_t *)
3823 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3824 idx = findidx (&cp);
3826 for (i = 0; i < cset->nequiv_classes; ++i)
3828 int32_t equiv_class_idx = cset->equiv_classes[i];
3829 size_t weight_len = weights[idx];
3830 if (weight_len == weights[equiv_class_idx])
3833 while (cnt <= weight_len
3834 && (weights[equiv_class_idx + 1 + cnt]
3835 == weights[idx + 1 + cnt]))
3837 if (cnt > weight_len)
3839 match_len = elem_len;
3840 goto check_node_accept_bytes_match;
3849 /* match with range expression? */
3851 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3853 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3856 for (i = 0; i < cset->nranges; ++i)
3858 cmp_buf[0] = cset->range_starts[i];
3859 cmp_buf[4] = cset->range_ends[i];
3860 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3861 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3863 match_len = char_len;
3864 goto check_node_accept_bytes_match;
3868 check_node_accept_bytes_match:
3869 if (!cset->non_match)
3876 return (elem_len > char_len) ? elem_len : char_len;
3884 find_collation_sequence_value (mbs, mbs_len)
3885 const unsigned char *mbs;
3888 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3893 /* No valid character. Match it as a single byte character. */
3894 const unsigned char *collseq = (const unsigned char *)
3895 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3896 return collseq[mbs[0]];
3903 const unsigned char *extra = (const unsigned char *)
3904 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3905 int32_t extrasize = (const unsigned char *)
3906 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3908 for (idx = 0; idx < extrasize;)
3910 int mbs_cnt, found = 0;
3911 int32_t elem_mbs_len;
3912 /* Skip the name of collating element name. */
3913 idx = idx + extra[idx] + 1;
3914 elem_mbs_len = extra[idx++];
3915 if (mbs_len == elem_mbs_len)
3917 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3918 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3920 if (mbs_cnt == elem_mbs_len)
3921 /* Found the entry. */
3924 /* Skip the byte sequence of the collating element. */
3925 idx += elem_mbs_len;
3926 /* Adjust for the alignment. */
3927 idx = (idx + 3) & ~3;
3928 /* Skip the collation sequence value. */
3929 idx += sizeof (uint32_t);
3930 /* Skip the wide char sequence of the collating element. */
3931 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3932 /* If we found the entry, return the sequence value. */
3934 return *(uint32_t *) (extra + idx);
3935 /* Skip the collation sequence value. */
3936 idx += sizeof (uint32_t);
3942 #endif /* RE_ENABLE_I18N */
3944 /* Check whether the node accepts the byte which is IDX-th
3945 byte of the INPUT. */
3949 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3953 ch = re_string_byte_at (&mctx->input, idx);
3957 if (node->opr.c != ch)
3961 case SIMPLE_BRACKET:
3962 if (!bitset_contain (node->opr.sbcset, ch))
3966 #ifdef RE_ENABLE_I18N
3967 case OP_UTF8_PERIOD:
3973 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3974 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3982 if (node->constraint)
3984 /* The node has constraints. Check whether the current context
3985 satisfies the constraints. */
3986 unsigned int context = re_string_context_at (&mctx->input, idx,
3988 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3995 /* Extend the buffers, if the buffers have run out. */
3997 static reg_errcode_t
3999 extend_buffers (re_match_context_t *mctx)
4002 re_string_t *pstr = &mctx->input;
4004 /* Double the lengthes of the buffers. */
4005 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4006 if (BE (ret != REG_NOERROR, 0))
4009 if (mctx->state_log != NULL)
4011 /* And double the length of state_log. */
4012 /* XXX We have no indication of the size of this buffer. If this
4013 allocation fail we have no indication that the state_log array
4014 does not have the right size. */
4015 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4016 pstr->bufs_len + 1);
4017 if (BE (new_array == NULL, 0))
4019 mctx->state_log = new_array;
4022 /* Then reconstruct the buffers. */
4025 #ifdef RE_ENABLE_I18N
4026 if (pstr->mb_cur_max > 1)
4028 ret = build_wcs_upper_buffer (pstr);
4029 if (BE (ret != REG_NOERROR, 0))
4033 #endif /* RE_ENABLE_I18N */
4034 build_upper_buffer (pstr);
4038 #ifdef RE_ENABLE_I18N
4039 if (pstr->mb_cur_max > 1)
4040 build_wcs_buffer (pstr);
4042 #endif /* RE_ENABLE_I18N */
4044 if (pstr->trans != NULL)
4045 re_string_translate_buffer (pstr);
4052 /* Functions for matching context. */
4054 /* Initialize MCTX. */
4056 static reg_errcode_t
4058 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4060 mctx->eflags = eflags;
4061 mctx->match_last = -1;
4064 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4065 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4066 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4069 /* Already zero-ed by the caller.
4071 mctx->bkref_ents = NULL;
4072 mctx->nbkref_ents = 0;
4073 mctx->nsub_tops = 0; */
4074 mctx->abkref_ents = n;
4075 mctx->max_mb_elem_len = 1;
4076 mctx->asub_tops = n;
4080 /* Clean the entries which depend on the current input in MCTX.
4081 This function must be invoked when the matcher changes the start index
4082 of the input, or changes the input string. */
4086 match_ctx_clean (re_match_context_t *mctx)
4089 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4092 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4093 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4095 re_sub_match_last_t *last = top->lasts[sl_idx];
4096 re_free (last->path.array);
4099 re_free (top->lasts);
4102 re_free (top->path->array);
4103 re_free (top->path);
4108 mctx->nsub_tops = 0;
4109 mctx->nbkref_ents = 0;
4112 /* Free all the memory associated with MCTX. */
4116 match_ctx_free (re_match_context_t *mctx)
4118 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4119 match_ctx_clean (mctx);
4120 re_free (mctx->sub_tops);
4121 re_free (mctx->bkref_ents);
4124 /* Add a new backreference entry to MCTX.
4125 Note that we assume that caller never call this function with duplicate
4126 entry, and call with STR_IDX which isn't smaller than any existing entry.
4129 static reg_errcode_t
4131 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx,
4134 if (mctx->nbkref_ents >= mctx->abkref_ents)
4136 struct re_backref_cache_entry* new_entry;
4137 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4138 mctx->abkref_ents * 2);
4139 if (BE (new_entry == NULL, 0))
4141 re_free (mctx->bkref_ents);
4144 mctx->bkref_ents = new_entry;
4145 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4146 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4147 mctx->abkref_ents *= 2;
4149 if (mctx->nbkref_ents > 0
4150 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4151 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4153 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4154 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4155 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4156 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4158 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4159 If bit N is clear, means that this entry won't epsilon-transition to
4160 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4161 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4164 A backreference does not epsilon-transition unless it is empty, so set
4165 to all zeros if FROM != TO. */
4166 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4167 = (from == to ? ~0 : 0);
4169 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4170 if (mctx->max_mb_elem_len < to - from)
4171 mctx->max_mb_elem_len = to - from;
4175 /* Search for the first entry which has the same str_idx, or -1 if none is
4176 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4180 search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
4182 int left, right, mid, last;
4183 last = right = mctx->nbkref_ents;
4184 for (left = 0; left < right;)
4186 mid = (left + right) / 2;
4187 if (mctx->bkref_ents[mid].str_idx < str_idx)
4192 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4198 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4201 static reg_errcode_t
4203 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4206 assert (mctx->sub_tops != NULL);
4207 assert (mctx->asub_tops > 0);
4209 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4211 int new_asub_tops = mctx->asub_tops * 2;
4212 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4213 re_sub_match_top_t *,
4215 if (BE (new_array == NULL, 0))
4217 mctx->sub_tops = new_array;
4218 mctx->asub_tops = new_asub_tops;
4220 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4221 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4223 mctx->sub_tops[mctx->nsub_tops]->node = node;
4224 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4228 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4229 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4231 static re_sub_match_last_t *
4233 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4235 re_sub_match_last_t *new_entry;
4236 if (BE (subtop->nlasts == subtop->alasts, 0))
4238 int new_alasts = 2 * subtop->alasts + 1;
4239 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4240 re_sub_match_last_t *,
4242 if (BE (new_array == NULL, 0))
4244 subtop->lasts = new_array;
4245 subtop->alasts = new_alasts;
4247 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4248 if (BE (new_entry != NULL, 1))
4250 subtop->lasts[subtop->nlasts] = new_entry;
4251 new_entry->node = node;
4252 new_entry->str_idx = str_idx;
4260 sift_ctx_init (re_sift_context_t *sctx,
4261 re_dfastate_t **sifted_sts,
4262 re_dfastate_t **limited_sts,
4263 int last_node, int last_str_idx)
4265 sctx->sifted_states = sifted_sts;
4266 sctx->limited_states = limited_sts;
4267 sctx->last_node = last_node;
4268 sctx->last_str_idx = last_str_idx;
4269 re_node_set_init_empty (&sctx->limits);