1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n) internal_function;
22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
23 static void match_ctx_free (re_match_context_t *cache) internal_function;
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to)
27 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
30 Idx str_idx) internal_function;
31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
32 Idx node, Idx str_idx)
34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35 re_dfastate_t **limited_sts, Idx last_node,
38 static reg_errcode_t re_search_internal (const regex_t *preg,
39 const char *string, Idx length,
40 Idx start, Idx last_start, Idx stop,
41 size_t nmatch, regmatch_t pmatch[],
42 int eflags) internal_function;
43 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
44 const char *string1, Idx length1,
45 const char *string2, Idx length2,
46 Idx start, regoff_t range,
47 struct re_registers *regs,
48 Idx stop, bool ret_len) internal_function;
49 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, Idx length, Idx start,
51 regoff_t range, Idx stop,
52 struct re_registers *regs,
53 bool ret_len) internal_function;
54 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
55 Idx nregs, int regs_allocated) internal_function;
56 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
58 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
61 static Idx check_halt_state_context (const re_match_context_t *mctx,
62 const re_dfastate_t *state, Idx idx)
64 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
65 regmatch_t *prev_idx_match, Idx cur_node,
66 Idx cur_idx, Idx nmatch) internal_function;
67 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
68 Idx str_idx, Idx dest_node, Idx nregs,
70 re_node_set *eps_via_nodes) internal_function;
71 static reg_errcode_t set_regs (const regex_t *preg,
72 const re_match_context_t *mctx,
73 size_t nmatch, regmatch_t *pmatch,
74 bool fl_backtrack) internal_function;
75 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
78 static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 Idx node_idx, Idx str_idx, Idx max_str_idx) internal_function;
81 #endif /* RE_ENABLE_I18N */
82 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
83 re_sift_context_t *sctx) internal_function;
84 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
85 re_sift_context_t *sctx, Idx str_idx,
86 re_node_set *cur_dest) internal_function;
87 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
88 re_sift_context_t *sctx,
90 re_node_set *dest_nodes) internal_function;
91 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
92 re_node_set *dest_nodes,
93 const re_node_set *candidates) internal_function;
94 static bool check_dst_limits (const re_match_context_t *mctx,
95 const re_node_set *limits,
96 Idx dst_node, Idx dst_idx, Idx src_node,
97 Idx src_idx) internal_function;
98 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
99 int boundaries, Idx subexp_idx,
100 Idx from_node, Idx bkref_idx) internal_function;
101 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
102 Idx limit, Idx subexp_idx,
103 Idx node, Idx str_idx,
104 Idx bkref_idx) internal_function;
105 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
106 re_node_set *dest_nodes,
107 const re_node_set *candidates,
109 struct re_backref_cache_entry *bkref_ents,
110 Idx str_idx) internal_function;
111 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
112 re_sift_context_t *sctx,
113 Idx str_idx, const re_node_set *candidates) internal_function;
114 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
115 re_dfastate_t **src, Idx num) internal_function;
116 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
117 re_match_context_t *mctx) internal_function;
118 static re_dfastate_t *transit_state (reg_errcode_t *err,
119 re_match_context_t *mctx,
120 re_dfastate_t *state) internal_function;
121 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
122 re_match_context_t *mctx,
123 re_dfastate_t *next_state) internal_function;
124 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
125 re_node_set *cur_nodes,
126 Idx str_idx) internal_function;
128 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
129 re_match_context_t *mctx,
130 re_dfastate_t *pstate) internal_function;
132 #ifdef RE_ENABLE_I18N
133 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
134 re_dfastate_t *pstate) internal_function;
135 #endif /* RE_ENABLE_I18N */
136 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
137 const re_node_set *nodes) internal_function;
138 static reg_errcode_t get_subexp (re_match_context_t *mctx,
139 Idx bkref_node, Idx bkref_str_idx) internal_function;
140 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
141 const re_sub_match_top_t *sub_top,
142 re_sub_match_last_t *sub_last,
143 Idx bkref_node, Idx bkref_str) internal_function;
144 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
145 Idx subexp_idx, int type) internal_function;
146 static reg_errcode_t check_arrival (re_match_context_t *mctx,
147 state_array_t *path, Idx top_node,
148 Idx top_str, Idx last_node, Idx last_str,
149 int type) internal_function;
150 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
152 re_node_set *cur_nodes,
153 re_node_set *next_nodes) internal_function;
154 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
155 re_node_set *cur_nodes,
156 Idx ex_subexp, int type) internal_function;
157 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
158 re_node_set *dst_nodes,
159 Idx target, Idx ex_subexp,
160 int type) internal_function;
161 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
162 re_node_set *cur_nodes, Idx cur_str,
163 Idx subexp_num, int type) internal_function;
164 static bool build_trtable (re_dfa_t *dfa,
165 re_dfastate_t *state) internal_function;
166 #ifdef RE_ENABLE_I18N
167 static int check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
168 const re_string_t *input, Idx idx) internal_function;
170 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
171 size_t name_len) internal_function;
173 #endif /* RE_ENABLE_I18N */
174 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
175 const re_dfastate_t *state,
176 re_node_set *states_node,
177 bitset *states_ch) internal_function;
178 static bool check_node_accept (const re_match_context_t *mctx,
179 const re_token_t *node, Idx idx)
181 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
183 /* Entry point for POSIX code. */
185 /* regexec searches for a given pattern, specified by PREG, in the
188 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
189 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
190 least NMATCH elements, and we set them to the offsets of the
191 corresponding matched substrings.
193 EFLAGS specifies `execution flags' which affect matching: if
194 REG_NOTBOL is set, then ^ does not match at the beginning of the
195 string; if REG_NOTEOL is set, then $ does not match at the end.
197 We return 0 if we find a match and REG_NOMATCH if not. */
200 regexec (const regex_t *__restrict preg, const char *__restrict string,
201 size_t nmatch, regmatch_t pmatch[], int eflags)
206 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
209 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
212 if (eflags & REG_STARTEND)
214 start = pmatch[0].rm_so;
215 length = pmatch[0].rm_eo;
220 length = strlen (string);
223 __libc_lock_lock (dfa->lock);
225 err = re_search_internal (preg, string, length, start, length,
226 length, 0, NULL, eflags);
228 err = re_search_internal (preg, string, length, start, length,
229 length, nmatch, pmatch, eflags);
230 __libc_lock_unlock (dfa->lock);
231 return err != REG_NOERROR;
235 # include <shlib-compat.h>
236 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
238 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
239 __typeof__ (__regexec) __compat_regexec;
242 attribute_compat_text_section
243 __compat_regexec (const regex_t *__restrict preg,
244 const char *__restrict string, size_t nmatch,
245 regmatch_t pmatch[], int eflags)
247 return regexec (preg, string, nmatch, pmatch,
248 eflags & (REG_NOTBOL | REG_NOTEOL));
250 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
254 /* Entry points for GNU code. */
256 /* re_match, re_search, re_match_2, re_search_2
258 The former two functions operate on STRING with length LENGTH,
259 while the later two operate on concatenation of STRING1 and STRING2
260 with lengths LENGTH1 and LENGTH2, respectively.
262 re_match() matches the compiled pattern in BUFP against the string,
263 starting at index START.
265 re_search() first tries matching at index START, then it tries to match
266 starting from index START + 1, and so on. The last start position tried
267 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
270 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
271 the first STOP characters of the concatenation of the strings should be
274 If REGS is not NULL, and BUFP->re_no_sub is not set, the offsets of the match
275 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
276 computed relative to the concatenation, not relative to the individual
279 On success, re_match* functions return the length of the match, re_search*
280 return the position of the start of the match. Return value -1 means no
281 match was found and -2 indicates an internal error. */
284 re_match (struct re_pattern_buffer *bufp, const char *string,
285 Idx length, Idx start, struct re_registers *regs)
287 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
290 weak_alias (__re_match, re_match)
294 re_search (struct re_pattern_buffer *bufp, const char *string,
295 Idx length, Idx start, regoff_t range, struct re_registers *regs)
297 return re_search_stub (bufp, string, length, start, range, length, regs,
301 weak_alias (__re_search, re_search)
305 re_match_2 (struct re_pattern_buffer *bufp,
306 const char *string1, Idx length1,
307 const char *string2, Idx length2,
308 Idx start, struct re_registers *regs, Idx stop)
310 return re_search_2_stub (bufp, string1, length1, string2, length2,
311 start, 0, regs, stop, true);
314 weak_alias (__re_match_2, re_match_2)
318 re_search_2 (struct re_pattern_buffer *bufp,
319 const char *string1, Idx length1,
320 const char *string2, Idx length2,
321 Idx start, regoff_t range, struct re_registers *regs, Idx stop)
323 return re_search_2_stub (bufp, string1, length1, string2, length2,
324 start, range, regs, stop, false);
327 weak_alias (__re_search_2, re_search_2)
332 re_search_2_stub (struct re_pattern_buffer *bufp,
333 const char *string1, Idx length1,
334 const char *string2, Idx length2,
335 Idx start, regoff_t range, struct re_registers *regs,
336 Idx stop, bool ret_len)
340 Idx len = length1 + length2;
343 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
346 /* Concatenate the strings. */
350 s = re_malloc (char, len);
352 if (BE (s == NULL, 0))
354 memcpy (s, string1, length1);
355 memcpy (s + length1, string2, length2);
363 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
369 /* The parameters have the same meaning as those of re_search.
370 Additional parameters:
371 If RET_LEN is true the length of the match is returned (re_match style);
372 otherwise the position of the match is returned. */
376 re_search_stub (struct re_pattern_buffer *bufp,
377 const char *string, Idx length,
378 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
381 reg_errcode_t result;
387 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
389 Idx last_start = start + range;
391 /* Check for out-of-range. */
392 if (BE (start < 0 || start > length, 0))
394 if (sizeof start < sizeof range)
396 regoff_t length_offset = length;
397 regoff_t start_offset = start;
398 if (BE (length_offset - start_offset < range, 0))
400 else if (BE (range < - start_offset, 0))
405 if (BE ((last_start < start) != (range < 0), 0))
407 /* Overflow occurred when computing last_start; substitute
408 the extreme value. */
409 last_start = range < 0 ? 0 : length;
413 if (BE (length < last_start, 0))
415 else if (BE (last_start < 0, 0))
420 __libc_lock_lock (dfa->lock);
422 eflags |= (bufp->re_not_bol) ? REG_NOTBOL : 0;
423 eflags |= (bufp->re_not_eol) ? REG_NOTEOL : 0;
425 /* Compile fastmap if we haven't yet. */
426 if (start < last_start && bufp->re_fastmap != NULL
427 && !bufp->re_fastmap_accurate)
428 re_compile_fastmap (bufp);
430 if (BE (bufp->re_no_sub, 0))
433 /* We need at least 1 register. */
436 else if (BE (bufp->re_regs_allocated == REG_FIXED
437 && regs->rm_num_regs <= bufp->re_nsub, 0))
439 nregs = regs->rm_num_regs;
440 if (BE (nregs < 1, 0))
442 /* Nothing can be copied to regs. */
448 nregs = bufp->re_nsub + 1;
449 pmatch = re_xmalloc (regmatch_t, nregs);
450 if (BE (pmatch == NULL, 0))
456 result = re_search_internal (bufp, string, length, start, last_start, stop,
457 nregs, pmatch, eflags);
461 /* I hope we needn't fill ther regs with -1's when no match was found. */
462 if (result != REG_NOERROR)
464 else if (regs != NULL)
466 /* If caller wants register contents data back, copy them. */
467 bufp->re_regs_allocated = re_copy_regs (regs, pmatch, nregs,
468 bufp->re_regs_allocated);
469 if (BE (bufp->re_regs_allocated == REG_UNALLOCATED, 0))
473 if (BE (rval == 0, 1))
477 assert (pmatch[0].rm_so == start);
478 rval = pmatch[0].rm_eo - start;
481 rval = pmatch[0].rm_so;
485 __libc_lock_unlock (dfa->lock);
491 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
494 int rval = REG_REALLOCATE;
496 Idx need_regs = nregs + 1;
497 /* We need one extra element beyond `rm_num_regs' for the `-1' marker GNU code
500 /* Have the register data arrays been allocated? */
501 if (regs_allocated == REG_UNALLOCATED)
502 { /* No. So allocate them with malloc. */
503 regs->rm_start = re_xmalloc (regoff_t, need_regs);
504 regs->rm_end = re_malloc (regoff_t, need_regs);
505 if (BE (regs->rm_start == NULL, 0) || BE (regs->rm_end == NULL, 0))
506 return REG_UNALLOCATED;
507 regs->rm_num_regs = need_regs;
509 else if (regs_allocated == REG_REALLOCATE)
510 { /* Yes. If we need more elements than were already
511 allocated, reallocate them. If we need fewer, just
513 if (BE (need_regs > regs->rm_num_regs, 0))
515 regoff_t *new_start =
516 re_xrealloc (regs->rm_start, regoff_t, need_regs);
517 regoff_t *new_end = re_realloc (regs->rm_end, regoff_t, need_regs);
518 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
519 return REG_UNALLOCATED;
520 regs->rm_start = new_start;
521 regs->rm_end = new_end;
522 regs->rm_num_regs = need_regs;
527 assert (regs_allocated == REG_FIXED);
528 /* This function may not be called with REG_FIXED and nregs too big. */
529 assert (regs->rm_num_regs >= nregs);
534 for (i = 0; i < nregs; ++i)
536 regs->rm_start[i] = pmatch[i].rm_so;
537 regs->rm_end[i] = pmatch[i].rm_eo;
539 for ( ; i < regs->rm_num_regs; ++i)
540 regs->rm_start[i] = regs->rm_end[i] = -1;
545 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
546 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
547 this memory for recording register information. STARTS and ENDS
548 must be allocated using the malloc library routine, and must each
549 be at least NUM_REGS * sizeof (regoff_t) bytes long.
551 If NUM_REGS == 0, then subsequent matches should allocate their own
554 Unless this function is called, the first search or match using
555 PATTERN_BUFFER will allocate its own register data, without
556 freeing the old data. */
559 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
560 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
564 bufp->re_regs_allocated = REG_REALLOCATE;
565 regs->rm_num_regs = num_regs;
566 regs->rm_start = starts;
571 bufp->re_regs_allocated = REG_UNALLOCATED;
572 regs->rm_num_regs = 0;
573 regs->rm_start = regs->rm_end = NULL;
577 weak_alias (__re_set_registers, re_set_registers)
580 /* Entry points compatible with 4.2 BSD regex library. We don't define
581 them unless specifically requested. */
583 #if defined _REGEX_RE_COMP || defined _LIBC
588 re_exec (const char *s)
590 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
592 #endif /* _REGEX_RE_COMP */
594 /* Internal entry point. */
596 /* Searches for a compiled pattern PREG in the string STRING, whose
597 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
598 meaning as with regexec. LAST_START is START + RANGE, where
599 START and RANGE have the same meaning as with re_search.
600 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
601 otherwise return the error code.
602 Note: We assume front end functions already check ranges.
603 (0 <= LAST_START && LAST_START <= LENGTH) */
607 re_search_internal (const regex_t *preg,
608 const char *string, Idx length,
609 Idx start, Idx last_start, Idx stop,
610 size_t nmatch, regmatch_t pmatch[],
614 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
615 Idx left_lim, right_lim;
617 bool fl_longest_match;
619 Idx match_first, match_last = REG_MISSING;
623 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
624 re_match_context_t mctx = { .dfa = dfa };
626 re_match_context_t mctx;
628 char *fastmap = ((preg->re_fastmap != NULL && preg->re_fastmap_accurate
629 && start != last_start && !preg->re_can_be_null)
630 ? preg->re_fastmap : NULL);
631 unsigned REG_TRANSLATE_TYPE t =
632 (unsigned REG_TRANSLATE_TYPE) preg->re_translate;
634 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
635 memset (&mctx, '\0', sizeof (re_match_context_t));
639 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
640 nmatch -= extra_nmatch;
642 /* Check if the DFA haven't been compiled. */
643 if (BE (preg->re_used == 0 || dfa->init_state == NULL
644 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
645 || dfa->init_state_begbuf == NULL, 0))
649 /* We assume front-end functions already check them. */
650 assert (0 <= last_start && last_start <= length);
653 /* If initial states with non-begbuf contexts have no elements,
654 the regex must be anchored. If preg->re_newline_anchor is set,
655 we'll never use init_state_nl, so do not check it. */
656 if (dfa->init_state->nodes.nelem == 0
657 && dfa->init_state_word->nodes.nelem == 0
658 && (dfa->init_state_nl->nodes.nelem == 0
659 || !preg->re_newline_anchor))
661 if (start != 0 && last_start != 0)
663 start = last_start = 0;
666 /* We must check the longest matching, if nmatch > 0. */
667 fl_longest_match = (nmatch != 0 || dfa->nbackref);
669 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
671 preg->re_syntax & REG_IGNORE_CASE, dfa);
672 if (BE (err != REG_NOERROR, 0))
674 mctx.input.stop = stop;
675 mctx.input.raw_stop = stop;
676 mctx.input.newline_anchor = preg->re_newline_anchor;
678 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
679 if (BE (err != REG_NOERROR, 0))
682 /* We will log all the DFA states through which the dfa pass,
683 if nmatch > 1, or this dfa has "multibyte node", which is a
684 back-reference or a node which can accept multibyte character or
685 multi character collating element. */
686 if (nmatch > 1 || dfa->has_mb_node)
688 mctx.state_log = re_xmalloc (re_dfastate_t *, mctx.input.bufs_len + 1);
689 if (BE (mctx.state_log == NULL, 0))
696 mctx.state_log = NULL;
699 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
700 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
702 /* Check incrementally whether of not the input string match. */
703 incr = (last_start < start) ? -1 : 1;
704 left_lim = (last_start < start) ? last_start : start;
705 right_lim = (last_start < start) ? start : last_start;
706 sb = dfa->mb_cur_max == 1;
709 ? ((sb || !(preg->re_syntax & REG_IGNORE_CASE || t) ? 4 : 0)
710 | (start <= last_start ? 2 : 0)
711 | (t != NULL ? 1 : 0))
714 for (;; match_first += incr)
717 if (match_first < left_lim || right_lim < match_first)
720 /* Advance as rapidly as possible through the string, until we
721 find a plausible place to start matching. This may be done
722 with varying efficiency, so there are various possibilities:
723 only the most common of them are specialized, in order to
724 save on code size. We use a switch statement for speed. */
732 /* Fastmap with single-byte translation, match forward. */
733 while (BE (match_first < right_lim, 1)
734 && !fastmap[t[(unsigned char) string[match_first]]])
736 goto forward_match_found_start_or_reached_end;
739 /* Fastmap without translation, match forward. */
740 while (BE (match_first < right_lim, 1)
741 && !fastmap[(unsigned char) string[match_first]])
744 forward_match_found_start_or_reached_end:
745 if (BE (match_first == right_lim, 0))
747 ch = match_first >= length
748 ? 0 : (unsigned char) string[match_first];
749 if (!fastmap[t ? t[ch] : ch])
756 /* Fastmap without multi-byte translation, match backwards. */
757 while (match_first >= left_lim)
759 ch = match_first >= length
760 ? 0 : (unsigned char) string[match_first];
761 if (fastmap[t ? t[ch] : ch])
765 if (match_first < left_lim)
770 /* In this case, we can't determine easily the current byte,
771 since it might be a component byte of a multibyte
772 character. Then we use the constructed buffer instead. */
775 /* If MATCH_FIRST is out of the valid range, reconstruct the
777 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
778 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
780 err = re_string_reconstruct (&mctx.input, match_first,
782 if (BE (err != REG_NOERROR, 0))
785 offset = match_first - mctx.input.raw_mbs_idx;
787 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
788 Note that MATCH_FIRST must not be smaller than 0. */
789 ch = (match_first >= length
790 ? 0 : re_string_byte_at (&mctx.input, offset));
794 if (match_first < left_lim || match_first > right_lim)
803 /* Reconstruct the buffers so that the matcher can assume that
804 the matching starts from the beginning of the buffer. */
805 err = re_string_reconstruct (&mctx.input, match_first, eflags);
806 if (BE (err != REG_NOERROR, 0))
809 #ifdef RE_ENABLE_I18N
810 /* Don't consider this char as a possible match start if it part,
811 yet isn't the head, of a multibyte character. */
812 if (!sb && !re_string_first_byte (&mctx.input, 0))
816 /* It seems to be appropriate one, then use the matcher. */
817 /* We assume that the matching starts from 0. */
818 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
819 match_last = check_matching (&mctx, fl_longest_match,
820 start <= last_start ? &match_first : NULL);
821 if (match_last != REG_MISSING)
823 if (BE (match_last == REG_ERROR, 0))
830 mctx.match_last = match_last;
831 if ((!preg->re_no_sub && nmatch > 1) || dfa->nbackref)
833 re_dfastate_t *pstate = mctx.state_log[match_last];
834 mctx.last_node = check_halt_state_context (&mctx, pstate,
837 if ((!preg->re_no_sub && nmatch > 1 && dfa->has_plural_match)
840 err = prune_impossible_nodes (&mctx);
841 if (err == REG_NOERROR)
843 if (BE (err != REG_NOMATCH, 0))
845 match_last = REG_MISSING;
848 break; /* We found a match. */
852 match_ctx_clean (&mctx);
856 assert (match_last != REG_MISSING);
857 assert (err == REG_NOERROR);
860 /* Set pmatch[] if we need. */
865 /* Initialize registers. */
866 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
867 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
869 /* Set the points where matching start/end. */
871 pmatch[0].rm_eo = mctx.match_last;
872 /* FIXME: This function should fail if mctx.match_last exceeds
873 the maximum possible regoff_t value. We need a new error
874 code REG_OVERFLOW. */
876 if (!preg->re_no_sub && nmatch > 1)
878 err = set_regs (preg, &mctx, nmatch, pmatch,
879 dfa->has_plural_match && dfa->nbackref > 0);
880 if (BE (err != REG_NOERROR, 0))
884 /* At last, add the offset to the each registers, since we slided
885 the buffers so that we could assume that the matching starts
887 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
888 if (pmatch[reg_idx].rm_so != -1)
890 #ifdef RE_ENABLE_I18N
891 if (BE (mctx.input.offsets_needed != 0, 0))
893 pmatch[reg_idx].rm_so =
894 (pmatch[reg_idx].rm_so == mctx.input.valid_len
895 ? mctx.input.valid_raw_len
896 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
897 pmatch[reg_idx].rm_eo =
898 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
899 ? mctx.input.valid_raw_len
900 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
903 assert (mctx.input.offsets_needed == 0);
905 pmatch[reg_idx].rm_so += match_first;
906 pmatch[reg_idx].rm_eo += match_first;
908 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
910 pmatch[nmatch + reg_idx].rm_so = -1;
911 pmatch[nmatch + reg_idx].rm_eo = -1;
915 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
916 if (dfa->subexp_map[reg_idx] != reg_idx)
918 pmatch[reg_idx + 1].rm_so
919 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
920 pmatch[reg_idx + 1].rm_eo
921 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
926 re_free (mctx.state_log);
928 match_ctx_free (&mctx);
929 re_string_destruct (&mctx.input);
935 prune_impossible_nodes (re_match_context_t *mctx)
937 re_dfa_t *const dfa = mctx->dfa;
938 Idx halt_node, match_last;
940 re_dfastate_t **sifted_states;
941 re_dfastate_t **lim_states = NULL;
942 re_sift_context_t sctx;
944 assert (mctx->state_log != NULL);
946 match_last = mctx->match_last;
947 halt_node = mctx->last_node;
948 sifted_states = re_xmalloc (re_dfastate_t *, match_last + 1);
949 if (BE (sifted_states == NULL, 0))
956 lim_states = re_xmalloc (re_dfastate_t *, match_last + 1);
957 if (BE (lim_states == NULL, 0))
964 memset (lim_states, '\0',
965 sizeof (re_dfastate_t *) * (match_last + 1));
966 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
968 ret = sift_states_backward (mctx, &sctx);
969 re_node_set_free (&sctx.limits);
970 if (BE (ret != REG_NOERROR, 0))
972 if (sifted_states[0] != NULL || lim_states[0] != NULL)
977 if (! REG_VALID_INDEX (match_last))
982 } while (mctx->state_log[match_last] == NULL
983 || !mctx->state_log[match_last]->halt);
984 halt_node = check_halt_state_context (mctx,
985 mctx->state_log[match_last],
988 ret = merge_state_array (dfa, sifted_states, lim_states,
990 re_free (lim_states);
992 if (BE (ret != REG_NOERROR, 0))
997 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
998 ret = sift_states_backward (mctx, &sctx);
999 re_node_set_free (&sctx.limits);
1000 if (BE (ret != REG_NOERROR, 0))
1003 re_free (mctx->state_log);
1004 mctx->state_log = sifted_states;
1005 sifted_states = NULL;
1006 mctx->last_node = halt_node;
1007 mctx->match_last = match_last;
1010 re_free (sifted_states);
1011 re_free (lim_states);
1015 /* Acquire an initial state and return it.
1016 We must select appropriate initial state depending on the context,
1017 since initial states may have constraints like "\<", "^", etc.. */
1019 static inline re_dfastate_t *
1020 __attribute ((always_inline)) internal_function
1021 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1024 re_dfa_t *const dfa = mctx->dfa;
1025 if (dfa->init_state->has_constraint)
1027 unsigned int context;
1028 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1029 if (IS_WORD_CONTEXT (context))
1030 return dfa->init_state_word;
1031 else if (IS_ORDINARY_CONTEXT (context))
1032 return dfa->init_state;
1033 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1034 return dfa->init_state_begbuf;
1035 else if (IS_NEWLINE_CONTEXT (context))
1036 return dfa->init_state_nl;
1037 else if (IS_BEGBUF_CONTEXT (context))
1039 /* It is relatively rare case, then calculate on demand. */
1040 return re_acquire_state_context (err, dfa,
1041 dfa->init_state->entrance_nodes,
1045 /* Must not happen? */
1046 return dfa->init_state;
1049 return dfa->init_state;
1052 /* Check whether the regular expression match input string INPUT or not,
1053 and return the index where the matching end. Return REG_MISSING if
1054 there is no match, and return REG_ERROR in case of an error.
1055 FL_LONGEST_MATCH means we want the POSIX longest matching.
1056 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1057 next place where we may want to try matching.
1058 Note that the matcher assume that the maching starts from the current
1059 index of the buffer. */
1063 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1066 re_dfa_t *const dfa = mctx->dfa;
1069 Idx match_last = REG_MISSING;
1070 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1071 re_dfastate_t *cur_state;
1072 bool at_init_state = p_match_first != NULL;
1073 Idx next_start_idx = cur_str_idx;
1076 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1077 /* An initial state must not be NULL (invalid). */
1078 if (BE (cur_state == NULL, 0))
1080 assert (err == REG_ESPACE);
1084 if (mctx->state_log != NULL)
1086 mctx->state_log[cur_str_idx] = cur_state;
1088 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1089 later. E.g. Processing back references. */
1090 if (BE (dfa->nbackref, 0))
1092 at_init_state = false;
1093 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1094 if (BE (err != REG_NOERROR, 0))
1097 if (cur_state->has_backref)
1099 err = transit_state_bkref (mctx, &cur_state->nodes);
1100 if (BE (err != REG_NOERROR, 0))
1106 /* If the RE accepts NULL string. */
1107 if (BE (cur_state->halt, 0))
1109 if (!cur_state->has_constraint
1110 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1112 if (!fl_longest_match)
1116 match_last = cur_str_idx;
1122 while (!re_string_eoi (&mctx->input))
1124 re_dfastate_t *old_state = cur_state;
1125 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1127 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1128 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1129 && mctx->input.valid_len < mctx->input.len))
1131 err = extend_buffers (mctx);
1132 if (BE (err != REG_NOERROR, 0))
1134 assert (err == REG_ESPACE);
1139 cur_state = transit_state (&err, mctx, cur_state);
1140 if (mctx->state_log != NULL)
1141 cur_state = merge_state_with_log (&err, mctx, cur_state);
1143 if (cur_state == NULL)
1145 /* Reached the invalid state or an error. Try to recover a valid
1146 state using the state log, if available and if we have not
1147 already found a valid (even if not the longest) match. */
1148 if (BE (err != REG_NOERROR, 0))
1151 if (mctx->state_log == NULL
1152 || (match && !fl_longest_match)
1153 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1157 if (BE (at_init_state, 0))
1159 if (old_state == cur_state)
1160 next_start_idx = next_char_idx;
1162 at_init_state = false;
1165 if (cur_state->halt)
1167 /* Reached a halt state.
1168 Check the halt state can satisfy the current context. */
1169 if (!cur_state->has_constraint
1170 || check_halt_state_context (mctx, cur_state,
1171 re_string_cur_idx (&mctx->input)))
1173 /* We found an appropriate halt state. */
1174 match_last = re_string_cur_idx (&mctx->input);
1177 /* We found a match, do not modify match_first below. */
1178 p_match_first = NULL;
1179 if (!fl_longest_match)
1186 *p_match_first += next_start_idx;
1191 /* Check NODE match the current context. */
1195 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1197 re_token_type_t type = dfa->nodes[node].type;
1198 unsigned int constraint = dfa->nodes[node].constraint;
1199 if (type != END_OF_RE)
1203 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1208 /* Check the halt state STATE match the current context.
1209 Return 0 if not match, if the node, STATE has, is a halt node and
1210 match the context, return the node. */
1214 check_halt_state_context (const re_match_context_t *mctx,
1215 const re_dfastate_t *state, Idx idx)
1218 unsigned int context;
1220 assert (state->halt);
1222 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1223 for (i = 0; i < state->nodes.nelem; ++i)
1224 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1225 return state->nodes.elems[i];
1229 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1230 corresponding to the DFA).
1231 Return the destination node, and update EPS_VIA_NODES;
1232 return REG_MISSING in case of errors. */
1236 proceed_next_node (const re_match_context_t *mctx,
1237 Idx nregs, regmatch_t *regs, Idx *pidx, Idx node,
1238 re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1240 re_dfa_t *const dfa = mctx->dfa;
1243 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1245 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1246 re_node_set *edests = &dfa->edests[node];
1248 ok = re_node_set_insert (eps_via_nodes, node);
1251 /* Pick up a valid destination, or return REG_MISSING if none
1253 for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1255 Idx candidate = edests->elems[i];
1256 if (!re_node_set_contains (cur_nodes, candidate))
1258 if (dest_node == REG_MISSING)
1259 dest_node = candidate;
1263 /* In order to avoid infinite loop like "(a*)*", return the second
1264 epsilon-transition if the first was already considered. */
1265 if (re_node_set_contains (eps_via_nodes, dest_node))
1268 /* Otherwise, push the second epsilon-transition on the fail stack. */
1270 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1274 /* We know we are going to exit. */
1283 re_token_type_t type = dfa->nodes[node].type;
1285 #ifdef RE_ENABLE_I18N
1286 if (dfa->nodes[node].accept_mb)
1287 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1289 #endif /* RE_ENABLE_I18N */
1290 if (type == OP_BACK_REF)
1292 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1293 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1296 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1300 char *buf = (char *) re_string_get_buffer (&mctx->input);
1301 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1310 ok = re_node_set_insert (eps_via_nodes, node);
1313 dest_node = dfa->edests[node].elems[0];
1314 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1321 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1323 Idx dest_node = dfa->nexts[node];
1324 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1325 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1326 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1329 re_node_set_empty (eps_via_nodes);
1336 static reg_errcode_t
1338 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1339 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1342 Idx num = fs->num++;
1343 if (fs->num == fs->alloc)
1345 struct re_fail_stack_ent_t *new_array =
1346 re_x2realloc (fs->stack, struct re_fail_stack_ent_t, &fs->alloc);
1347 if (new_array == NULL)
1349 fs->stack = new_array;
1351 fs->stack[num].idx = str_idx;
1352 fs->stack[num].node = dest_node;
1353 fs->stack[num].regs = re_xmalloc (regmatch_t, nregs);
1354 if (fs->stack[num].regs == NULL)
1356 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1357 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1363 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx,
1364 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1366 Idx num = --fs->num;
1367 assert (REG_VALID_INDEX (num));
1368 *pidx = fs->stack[num].idx;
1369 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1370 re_node_set_free (eps_via_nodes);
1371 re_free (fs->stack[num].regs);
1372 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1373 return fs->stack[num].node;
1376 /* Set the positions where the subexpressions are starts/ends to registers
1378 Note: We assume that pmatch[0] is already set, and
1379 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1381 static reg_errcode_t
1383 set_regs (const regex_t *preg, const re_match_context_t *mctx,
1384 size_t nmatch, regmatch_t *pmatch, bool fl_backtrack)
1386 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1388 re_node_set eps_via_nodes;
1389 struct re_fail_stack_t *fs;
1390 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1391 regmatch_t *prev_idx_match;
1392 bool prev_idx_match_malloced = false;
1395 assert (nmatch > 1);
1396 assert (mctx->state_log != NULL);
1401 fs->stack = re_xmalloc (struct re_fail_stack_ent_t, fs->alloc);
1402 if (fs->stack == NULL)
1408 cur_node = dfa->init_node;
1409 re_node_set_init_empty (&eps_via_nodes);
1411 if (re_alloc_oversized (nmatch, sizeof (regmatch_t)))
1413 free_fail_stack_return (fs);
1416 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1417 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1420 prev_idx_match = re_malloc (regmatch_t, nmatch);
1421 if (prev_idx_match == NULL)
1423 free_fail_stack_return (fs);
1426 prev_idx_match_malloced = true;
1428 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1430 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1432 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1434 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1439 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1440 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1442 if (reg_idx == nmatch)
1444 re_node_set_free (&eps_via_nodes);
1445 if (prev_idx_match_malloced)
1446 re_free (prev_idx_match);
1447 return free_fail_stack_return (fs);
1449 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1454 re_node_set_free (&eps_via_nodes);
1455 if (prev_idx_match_malloced)
1456 re_free (prev_idx_match);
1461 /* Proceed to next node. */
1462 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1463 &eps_via_nodes, fs);
1465 if (BE (! REG_VALID_INDEX (cur_node), 0))
1467 if (BE (cur_node == REG_ERROR, 0))
1469 re_node_set_free (&eps_via_nodes);
1470 if (prev_idx_match_malloced)
1471 re_free (prev_idx_match);
1472 free_fail_stack_return (fs);
1476 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1480 re_node_set_free (&eps_via_nodes);
1481 if (prev_idx_match_malloced)
1482 re_free (prev_idx_match);
1487 re_node_set_free (&eps_via_nodes);
1488 if (prev_idx_match_malloced)
1489 re_free (prev_idx_match);
1490 return free_fail_stack_return (fs);
1493 static reg_errcode_t
1495 free_fail_stack_return (struct re_fail_stack_t *fs)
1500 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1502 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1503 re_free (fs->stack[fs_idx].regs);
1505 re_free (fs->stack);
1512 update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1513 Idx cur_node, Idx cur_idx, Idx nmatch)
1515 int type = dfa->nodes[cur_node].type;
1516 if (type == OP_OPEN_SUBEXP)
1518 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1520 /* We are at the first node of this sub expression. */
1521 if (reg_num < nmatch)
1523 pmatch[reg_num].rm_so = cur_idx;
1524 pmatch[reg_num].rm_eo = -1;
1527 else if (type == OP_CLOSE_SUBEXP)
1529 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1530 if (reg_num < nmatch)
1532 /* We are at the last node of this sub expression. */
1533 if (pmatch[reg_num].rm_so < cur_idx)
1535 pmatch[reg_num].rm_eo = cur_idx;
1536 /* This is a non-empty match or we are not inside an optional
1537 subexpression. Accept this right away. */
1538 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1542 if (dfa->nodes[cur_node].opt_subexp
1543 && prev_idx_match[reg_num].rm_so != -1)
1544 /* We transited through an empty match for an optional
1545 subexpression, like (a?)*, and this is not the subexp's
1546 first match. Copy back the old content of the registers
1547 so that matches of an inner subexpression are undone as
1548 well, like in ((a?))*. */
1549 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1551 /* We completed a subexpression, but it may be part of
1552 an optional one, so do not update PREV_IDX_MATCH. */
1553 pmatch[reg_num].rm_eo = cur_idx;
1559 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1560 and sift the nodes in each states according to the following rules.
1561 Updated state_log will be wrote to STATE_LOG.
1563 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1564 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1565 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1566 the LAST_NODE, we throw away the node `a'.
1567 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1568 string `s' and transit to `b':
1569 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1571 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1572 thrown away, we throw away the node `a'.
1573 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1574 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1576 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1577 we throw away the node `a'. */
1579 #define STATE_NODE_CONTAINS(state,node) \
1580 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1582 static reg_errcode_t
1584 sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1588 Idx str_idx = sctx->last_str_idx;
1589 re_node_set cur_dest;
1592 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1595 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1596 transit to the last_node and the last_node itself. */
1597 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1598 if (BE (err != REG_NOERROR, 0))
1600 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1601 if (BE (err != REG_NOERROR, 0))
1604 /* Then check each states in the state_log. */
1607 /* Update counters. */
1608 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1609 if (null_cnt > mctx->max_mb_elem_len)
1611 memset (sctx->sifted_states, '\0',
1612 sizeof (re_dfastate_t *) * str_idx);
1613 re_node_set_free (&cur_dest);
1616 re_node_set_empty (&cur_dest);
1619 if (mctx->state_log[str_idx])
1621 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1622 if (BE (err != REG_NOERROR, 0))
1626 /* Add all the nodes which satisfy the following conditions:
1627 - It can epsilon transit to a node in CUR_DEST.
1629 And update state_log. */
1630 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1631 if (BE (err != REG_NOERROR, 0))
1636 re_node_set_free (&cur_dest);
1640 static reg_errcode_t
1642 build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1643 Idx str_idx, re_node_set *cur_dest)
1645 re_dfa_t *const dfa = mctx->dfa;
1646 re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1649 /* Then build the next sifted state.
1650 We build the next sifted state on `cur_dest', and update
1651 `sifted_states[str_idx]' with `cur_dest'.
1653 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1654 `cur_src' points the node_set of the old `state_log[str_idx]'
1655 (with the epsilon nodes pre-filtered out). */
1656 for (i = 0; i < cur_src->nelem; i++)
1658 Idx prev_node = cur_src->elems[i];
1663 re_token_type_t type = dfa->nodes[prev_node].type;
1664 assert (!IS_EPSILON_NODE (type));
1666 #ifdef RE_ENABLE_I18N
1667 /* If the node may accept `multi byte'. */
1668 if (dfa->nodes[prev_node].accept_mb)
1669 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1670 str_idx, sctx->last_str_idx);
1671 #endif /* RE_ENABLE_I18N */
1673 /* We don't check backreferences here.
1674 See update_cur_sifted_state(). */
1676 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1677 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1678 dfa->nexts[prev_node]))
1684 if (sctx->limits.nelem)
1686 Idx to_idx = str_idx + naccepted;
1687 if (check_dst_limits (mctx, &sctx->limits,
1688 dfa->nexts[prev_node], to_idx,
1689 prev_node, str_idx))
1692 ok = re_node_set_insert (cur_dest, prev_node);
1700 /* Helper functions. */
1702 static reg_errcode_t
1704 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1706 Idx top = mctx->state_log_top;
1708 if (next_state_log_idx >= mctx->input.bufs_len
1709 || (next_state_log_idx >= mctx->input.valid_len
1710 && mctx->input.valid_len < mctx->input.len))
1713 err = extend_buffers (mctx);
1714 if (BE (err != REG_NOERROR, 0))
1718 if (top < next_state_log_idx)
1720 memset (mctx->state_log + top + 1, '\0',
1721 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1722 mctx->state_log_top = next_state_log_idx;
1727 static reg_errcode_t
1729 merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1734 for (st_idx = 0; st_idx < num; ++st_idx)
1736 if (dst[st_idx] == NULL)
1737 dst[st_idx] = src[st_idx];
1738 else if (src[st_idx] != NULL)
1740 re_node_set merged_set;
1741 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1742 &src[st_idx]->nodes);
1743 if (BE (err != REG_NOERROR, 0))
1745 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1746 re_node_set_free (&merged_set);
1747 if (BE (err != REG_NOERROR, 0))
1754 static reg_errcode_t
1756 update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1757 Idx str_idx, re_node_set *dest_nodes)
1759 re_dfa_t *const dfa = mctx->dfa;
1761 const re_node_set *candidates;
1762 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1763 : &mctx->state_log[str_idx]->nodes);
1765 if (dest_nodes->nelem == 0)
1766 sctx->sifted_states[str_idx] = NULL;
1771 /* At first, add the nodes which can epsilon transit to a node in
1773 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1774 if (BE (err != REG_NOERROR, 0))
1777 /* Then, check the limitations in the current sift_context. */
1778 if (sctx->limits.nelem)
1780 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1781 mctx->bkref_ents, str_idx);
1782 if (BE (err != REG_NOERROR, 0))
1787 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1788 if (BE (err != REG_NOERROR, 0))
1792 if (candidates && mctx->state_log[str_idx]->has_backref)
1794 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1795 if (BE (err != REG_NOERROR, 0))
1801 static reg_errcode_t
1803 add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1804 const re_node_set *candidates)
1806 reg_errcode_t err = REG_NOERROR;
1809 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1810 if (BE (err != REG_NOERROR, 0))
1813 if (!state->inveclosure.alloc)
1815 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1816 if (BE (err != REG_NOERROR, 0))
1818 for (i = 0; i < dest_nodes->nelem; i++)
1819 re_node_set_merge (&state->inveclosure,
1820 dfa->inveclosures + dest_nodes->elems[i]);
1822 return re_node_set_add_intersect (dest_nodes, candidates,
1823 &state->inveclosure);
1826 static reg_errcode_t
1828 sub_epsilon_src_nodes (re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1829 const re_node_set *candidates)
1833 re_node_set *inv_eclosure = dfa->inveclosures + node;
1834 re_node_set except_nodes;
1835 re_node_set_init_empty (&except_nodes);
1836 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1838 Idx cur_node = inv_eclosure->elems[ecl_idx];
1839 if (cur_node == node)
1841 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1843 Idx edst1 = dfa->edests[cur_node].elems[0];
1844 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1845 ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1846 if ((!re_node_set_contains (inv_eclosure, edst1)
1847 && re_node_set_contains (dest_nodes, edst1))
1848 || (REG_VALID_NONZERO_INDEX (edst2)
1849 && !re_node_set_contains (inv_eclosure, edst2)
1850 && re_node_set_contains (dest_nodes, edst2)))
1852 err = re_node_set_add_intersect (&except_nodes, candidates,
1853 dfa->inveclosures + cur_node);
1854 if (BE (err != REG_NOERROR, 0))
1856 re_node_set_free (&except_nodes);
1862 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1864 Idx cur_node = inv_eclosure->elems[ecl_idx];
1865 if (!re_node_set_contains (&except_nodes, cur_node))
1867 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1868 re_node_set_remove_at (dest_nodes, idx);
1871 re_node_set_free (&except_nodes);
1877 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1878 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1880 re_dfa_t *const dfa = mctx->dfa;
1881 Idx lim_idx, src_pos, dst_pos;
1883 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1884 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1885 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1888 struct re_backref_cache_entry *ent;
1889 ent = mctx->bkref_ents + limits->elems[lim_idx];
1890 subexp_idx = dfa->nodes[ent->node].opr.idx;
1892 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1893 subexp_idx, dst_node, dst_idx,
1895 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1896 subexp_idx, src_node, src_idx,
1900 <src> <dst> ( <subexp> )
1901 ( <subexp> ) <src> <dst>
1902 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1903 if (src_pos == dst_pos)
1904 continue; /* This is unrelated limitation. */
1913 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1914 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1916 re_dfa_t *const dfa = mctx->dfa;
1917 re_node_set *eclosures = dfa->eclosures + from_node;
1920 /* Else, we are on the boundary: examine the nodes on the epsilon
1922 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1924 Idx node = eclosures->elems[node_idx];
1925 switch (dfa->nodes[node].type)
1928 if (bkref_idx != REG_MISSING)
1930 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1936 if (ent->node != node)
1940 < CHAR_BIT * sizeof ent->eps_reachable_subexps_map
1941 && !(ent->eps_reachable_subexps_map & (1u << subexp_idx)))
1944 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1945 OP_CLOSE_SUBEXP cases below. But, if the
1946 destination node is the same node as the source
1947 node, don't recurse because it would cause an
1948 infinite loop: a regex that exhibits this behavior
1950 dst = dfa->edests[node].elems[0];
1951 if (dst == from_node)
1955 else /* if (boundaries & 2) */
1960 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1962 if (cpos == -1 /* && (boundaries & 1) */)
1964 if (cpos == 0 && (boundaries & 2))
1968 < CHAR_BIT * sizeof ent->eps_reachable_subexps_map)
1969 ent->eps_reachable_subexps_map &= ~(1u << subexp_idx);
1971 while (ent++->more);
1975 case OP_OPEN_SUBEXP:
1976 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1980 case OP_CLOSE_SUBEXP:
1981 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1990 return (boundaries & 2) ? 1 : 0;
1995 check_dst_limits_calc_pos (const re_match_context_t *mctx,
1996 Idx limit, Idx subexp_idx,
1997 Idx from_node, Idx str_idx, Idx bkref_idx)
1999 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2002 /* If we are outside the range of the subexpression, return -1 or 1. */
2003 if (str_idx < lim->subexp_from)
2006 if (lim->subexp_to < str_idx)
2009 /* If we are within the subexpression, return 0. */
2010 boundaries = (str_idx == lim->subexp_from);
2011 boundaries |= (str_idx == lim->subexp_to) << 1;
2012 if (boundaries == 0)
2015 /* Else, examine epsilon closure. */
2016 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2017 from_node, bkref_idx);
2020 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2021 which are against limitations from DEST_NODES. */
2023 static reg_errcode_t
2025 check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
2026 const re_node_set *candidates, re_node_set *limits,
2027 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2030 Idx node_idx, lim_idx;
2032 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2035 struct re_backref_cache_entry *ent;
2036 ent = bkref_ents + limits->elems[lim_idx];
2038 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2039 continue; /* This is unrelated limitation. */
2041 subexp_idx = dfa->nodes[ent->node].opr.idx;
2042 if (ent->subexp_to == str_idx)
2044 Idx ops_node = REG_MISSING;
2045 Idx cls_node = REG_MISSING;
2046 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2048 Idx node = dest_nodes->elems[node_idx];
2049 re_token_type_t type = dfa->nodes[node].type;
2050 if (type == OP_OPEN_SUBEXP
2051 && subexp_idx == dfa->nodes[node].opr.idx)
2053 else if (type == OP_CLOSE_SUBEXP
2054 && subexp_idx == dfa->nodes[node].opr.idx)
2058 /* Check the limitation of the open subexpression. */
2059 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2060 if (REG_VALID_INDEX (ops_node))
2062 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2064 if (BE (err != REG_NOERROR, 0))
2068 /* Check the limitation of the close subexpression. */
2069 if (REG_VALID_INDEX (cls_node))
2070 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2072 Idx node = dest_nodes->elems[node_idx];
2073 if (!re_node_set_contains (dfa->inveclosures + node,
2075 && !re_node_set_contains (dfa->eclosures + node,
2078 /* It is against this limitation.
2079 Remove it form the current sifted state. */
2080 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2082 if (BE (err != REG_NOERROR, 0))
2088 else /* (ent->subexp_to != str_idx) */
2090 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2092 Idx node = dest_nodes->elems[node_idx];
2093 re_token_type_t type = dfa->nodes[node].type;
2094 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2096 if (subexp_idx != dfa->nodes[node].opr.idx)
2098 /* It is against this limitation.
2099 Remove it form the current sifted state. */
2100 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2102 if (BE (err != REG_NOERROR, 0))
2111 static reg_errcode_t
2113 sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2114 Idx str_idx, const re_node_set *candidates)
2116 re_dfa_t *const dfa = mctx->dfa;
2119 re_sift_context_t local_sctx;
2120 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2122 if (first_idx == REG_MISSING)
2125 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2127 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2130 re_token_type_t type;
2131 struct re_backref_cache_entry *entry;
2132 node = candidates->elems[node_idx];
2133 type = dfa->nodes[node].type;
2134 /* Avoid infinite loop for the REs like "()\1+". */
2135 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2137 if (type != OP_BACK_REF)
2140 entry = mctx->bkref_ents + first_idx;
2141 enabled_idx = first_idx;
2145 Idx subexp_len, to_idx, dst_node;
2146 re_dfastate_t *cur_state;
2148 if (entry->node != node)
2150 subexp_len = entry->subexp_to - entry->subexp_from;
2151 to_idx = str_idx + subexp_len;
2152 dst_node = (subexp_len ? dfa->nexts[node]
2153 : dfa->edests[node].elems[0]);
2155 if (to_idx > sctx->last_str_idx
2156 || sctx->sifted_states[to_idx] == NULL
2157 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2158 || check_dst_limits (mctx, &sctx->limits, node,
2159 str_idx, dst_node, to_idx))
2162 if (local_sctx.sifted_states == NULL)
2165 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2166 if (BE (err != REG_NOERROR, 0))
2169 local_sctx.last_node = node;
2170 local_sctx.last_str_idx = str_idx;
2171 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2177 cur_state = local_sctx.sifted_states[str_idx];
2178 err = sift_states_backward (mctx, &local_sctx);
2179 if (BE (err != REG_NOERROR, 0))
2181 if (sctx->limited_states != NULL)
2183 err = merge_state_array (dfa, sctx->limited_states,
2184 local_sctx.sifted_states,
2186 if (BE (err != REG_NOERROR, 0))
2189 local_sctx.sifted_states[str_idx] = cur_state;
2190 re_node_set_remove (&local_sctx.limits, enabled_idx);
2192 /* mctx->bkref_ents may have changed, reload the pointer. */
2193 entry = mctx->bkref_ents + enabled_idx;
2195 while (enabled_idx++, entry++->more);
2199 if (local_sctx.sifted_states != NULL)
2201 re_node_set_free (&local_sctx.limits);
2208 #ifdef RE_ENABLE_I18N
2211 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2212 Idx node_idx, Idx str_idx, Idx max_str_idx)
2214 re_dfa_t *const dfa = mctx->dfa;
2216 /* Check the node can accept `multi byte'. */
2217 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2218 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2219 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2220 dfa->nexts[node_idx]))
2221 /* The node can't accept the `multi byte', or the
2222 destination was already thrown away, then the node
2223 could't accept the current input `multi byte'. */
2225 /* Otherwise, it is sure that the node could accept
2226 `naccepted' bytes input. */
2229 #endif /* RE_ENABLE_I18N */
2232 /* Functions for state transition. */
2234 /* Return the next state to which the current state STATE will transit by
2235 accepting the current input byte, and update STATE_LOG if necessary.
2236 If STATE can accept a multibyte char/collating element/back reference
2237 update the destination of STATE_LOG. */
2239 static re_dfastate_t *
2241 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2242 re_dfastate_t *state)
2244 re_dfastate_t **trtable;
2247 #ifdef RE_ENABLE_I18N
2248 /* If the current state can accept multibyte. */
2249 if (BE (state->accept_mb, 0))
2251 *err = transit_state_mb (mctx, state);
2252 if (BE (*err != REG_NOERROR, 0))
2255 #endif /* RE_ENABLE_I18N */
2257 /* Then decide the next state with the single byte. */
2260 /* don't use transition table */
2261 return transit_state_sb (err, mctx, state);
2264 /* Use transition table */
2265 ch = re_string_fetch_byte (&mctx->input);
2268 trtable = state->trtable;
2269 if (BE (trtable != NULL, 1))
2272 trtable = state->word_trtable;
2273 if (BE (trtable != NULL, 1))
2275 unsigned int context;
2277 = re_string_context_at (&mctx->input,
2278 re_string_cur_idx (&mctx->input) - 1,
2280 if (IS_WORD_CONTEXT (context))
2281 return trtable[ch + SBC_MAX];
2286 if (!build_trtable (mctx->dfa, state))
2292 /* Retry, we now have a transition table. */
2296 /* Update the state_log if we need */
2299 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2300 re_dfastate_t *next_state)
2302 re_dfa_t *const dfa = mctx->dfa;
2303 Idx cur_idx = re_string_cur_idx (&mctx->input);
2305 if (cur_idx > mctx->state_log_top)
2307 mctx->state_log[cur_idx] = next_state;
2308 mctx->state_log_top = cur_idx;
2310 else if (mctx->state_log[cur_idx] == 0)
2312 mctx->state_log[cur_idx] = next_state;
2316 re_dfastate_t *pstate;
2317 unsigned int context;
2318 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2319 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2320 the destination of a multibyte char/collating element/
2321 back reference. Then the next state is the union set of
2322 these destinations and the results of the transition table. */
2323 pstate = mctx->state_log[cur_idx];
2324 log_nodes = pstate->entrance_nodes;
2325 if (next_state != NULL)
2327 table_nodes = next_state->entrance_nodes;
2328 *err = re_node_set_init_union (&next_nodes, table_nodes,
2330 if (BE (*err != REG_NOERROR, 0))
2334 next_nodes = *log_nodes;
2335 /* Note: We already add the nodes of the initial state,
2336 then we don't need to add them here. */
2338 context = re_string_context_at (&mctx->input,
2339 re_string_cur_idx (&mctx->input) - 1,
2341 next_state = mctx->state_log[cur_idx]
2342 = re_acquire_state_context (err, dfa, &next_nodes, context);
2343 /* We don't need to check errors here, since the return value of
2344 this function is next_state and ERR is already set. */
2346 if (table_nodes != NULL)
2347 re_node_set_free (&next_nodes);
2350 if (BE (dfa->nbackref, 0) && next_state != NULL)
2352 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2353 later. We must check them here, since the back references in the
2354 next state might use them. */
2355 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2357 if (BE (*err != REG_NOERROR, 0))
2360 /* If the next state has back references. */
2361 if (next_state->has_backref)
2363 *err = transit_state_bkref (mctx, &next_state->nodes);
2364 if (BE (*err != REG_NOERROR, 0))
2366 next_state = mctx->state_log[cur_idx];
2373 /* Skip bytes in the input that correspond to part of a
2374 multi-byte match, then look in the log for a state
2375 from which to restart matching. */
2376 static re_dfastate_t *
2378 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2380 re_dfastate_t *cur_state = NULL;
2383 Idx max = mctx->state_log_top;
2384 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2388 if (++cur_str_idx > max)
2390 re_string_skip_bytes (&mctx->input, 1);
2392 while (mctx->state_log[cur_str_idx] == NULL);
2394 cur_state = merge_state_with_log (err, mctx, NULL);
2396 while (*err == REG_NOERROR && cur_state == NULL);
2400 /* Helper functions for transit_state. */
2402 /* From the node set CUR_NODES, pick up the nodes whose types are
2403 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2404 expression. And register them to use them later for evaluating the
2405 correspoding back references. */
2407 static reg_errcode_t
2409 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2412 re_dfa_t *const dfa = mctx->dfa;
2416 /* TODO: This isn't efficient.
2417 Because there might be more than one nodes whose types are
2418 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2421 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2423 Idx node = cur_nodes->elems[node_idx];
2424 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2425 && dfa->nodes[node].opr.idx < CHAR_BIT * sizeof dfa->used_bkref_map
2426 && dfa->used_bkref_map & (1u << dfa->nodes[node].opr.idx))
2428 err = match_ctx_add_subtop (mctx, node, str_idx);
2429 if (BE (err != REG_NOERROR, 0))
2437 /* Return the next state to which the current state STATE will transit by
2438 accepting the current input byte. */
2440 static re_dfastate_t *
2441 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2442 re_dfastate_t *state)
2444 re_dfa_t *const dfa = mctx->dfa;
2445 re_node_set next_nodes;
2446 re_dfastate_t *next_state;
2447 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2448 unsigned int context;
2450 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2451 if (BE (*err != REG_NOERROR, 0))
2453 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2455 Idx cur_node = state->nodes.elems[node_cnt];
2456 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2458 *err = re_node_set_merge (&next_nodes,
2459 dfa->eclosures + dfa->nexts[cur_node]);
2460 if (BE (*err != REG_NOERROR, 0))
2462 re_node_set_free (&next_nodes);
2467 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2468 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2469 /* We don't need to check errors here, since the return value of
2470 this function is next_state and ERR is already set. */
2472 re_node_set_free (&next_nodes);
2473 re_string_skip_bytes (&mctx->input, 1);
2478 #ifdef RE_ENABLE_I18N
2479 static reg_errcode_t
2481 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2483 re_dfa_t *const dfa = mctx->dfa;
2487 for (i = 0; i < pstate->nodes.nelem; ++i)
2489 re_node_set dest_nodes, *new_nodes;
2490 Idx cur_node_idx = pstate->nodes.elems[i];
2493 unsigned int context;
2494 re_dfastate_t *dest_state;
2496 if (!dfa->nodes[cur_node_idx].accept_mb)
2499 if (dfa->nodes[cur_node_idx].constraint)
2501 context = re_string_context_at (&mctx->input,
2502 re_string_cur_idx (&mctx->input),
2504 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2509 /* How many bytes the node can accept? */
2510 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2511 re_string_cur_idx (&mctx->input));
2515 /* The node can accepts `naccepted' bytes. */
2516 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2517 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2518 : mctx->max_mb_elem_len);
2519 err = clean_state_log_if_needed (mctx, dest_idx);
2520 if (BE (err != REG_NOERROR, 0))
2523 assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2525 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2527 dest_state = mctx->state_log[dest_idx];
2528 if (dest_state == NULL)
2529 dest_nodes = *new_nodes;
2532 err = re_node_set_init_union (&dest_nodes,
2533 dest_state->entrance_nodes, new_nodes);
2534 if (BE (err != REG_NOERROR, 0))
2537 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2538 mctx->state_log[dest_idx]
2539 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2540 if (dest_state != NULL)
2541 re_node_set_free (&dest_nodes);
2542 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2547 #endif /* RE_ENABLE_I18N */
2549 static reg_errcode_t
2551 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2553 re_dfa_t *const dfa = mctx->dfa;
2556 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2558 for (i = 0; i < nodes->nelem; ++i)
2560 Idx dest_str_idx, prev_nelem, bkc_idx;
2561 Idx node_idx = nodes->elems[i];
2562 unsigned int context;
2563 const re_token_t *node = dfa->nodes + node_idx;
2564 re_node_set *new_dest_nodes;
2566 /* Check whether `node' is a backreference or not. */
2567 if (node->type != OP_BACK_REF)
2570 if (node->constraint)
2572 context = re_string_context_at (&mctx->input, cur_str_idx,
2574 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2578 /* `node' is a backreference.
2579 Check the substring which the substring matched. */
2580 bkc_idx = mctx->nbkref_ents;
2581 err = get_subexp (mctx, node_idx, cur_str_idx);
2582 if (BE (err != REG_NOERROR, 0))
2585 /* And add the epsilon closures (which is `new_dest_nodes') of
2586 the backreference to appropriate state_log. */
2588 assert (dfa->nexts[node_idx] != REG_MISSING);
2590 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2593 re_dfastate_t *dest_state;
2594 struct re_backref_cache_entry *bkref_ent;
2595 bkref_ent = mctx->bkref_ents + bkc_idx;
2596 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2598 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2599 new_dest_nodes = (subexp_len == 0
2600 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2601 : dfa->eclosures + dfa->nexts[node_idx]);
2602 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2603 - bkref_ent->subexp_from);
2604 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2606 dest_state = mctx->state_log[dest_str_idx];
2607 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2608 : mctx->state_log[cur_str_idx]->nodes.nelem);
2609 /* Add `new_dest_node' to state_log. */
2610 if (dest_state == NULL)
2612 mctx->state_log[dest_str_idx]
2613 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2615 if (BE (mctx->state_log[dest_str_idx] == NULL
2616 && err != REG_NOERROR, 0))
2621 re_node_set dest_nodes;
2622 err = re_node_set_init_union (&dest_nodes,
2623 dest_state->entrance_nodes,
2625 if (BE (err != REG_NOERROR, 0))
2627 re_node_set_free (&dest_nodes);
2630 mctx->state_log[dest_str_idx]
2631 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2632 re_node_set_free (&dest_nodes);
2633 if (BE (mctx->state_log[dest_str_idx] == NULL
2634 && err != REG_NOERROR, 0))
2637 /* We need to check recursively if the backreference can epsilon
2640 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2642 err = check_subexp_matching_top (mctx, new_dest_nodes,
2644 if (BE (err != REG_NOERROR, 0))
2646 err = transit_state_bkref (mctx, new_dest_nodes);
2647 if (BE (err != REG_NOERROR, 0))
2657 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2658 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2659 Note that we might collect inappropriate candidates here.
2660 However, the cost of checking them strictly here is too high, then we
2661 delay these checking for prune_impossible_nodes(). */
2663 static reg_errcode_t
2665 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2667 re_dfa_t *const dfa = mctx->dfa;
2668 Idx subexp_num, sub_top_idx;
2669 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2670 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2671 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2672 if (cache_idx != REG_MISSING)
2674 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2676 if (entry->node == bkref_node)
2677 return REG_NOERROR; /* We already checked it. */
2678 while (entry++->more);
2681 subexp_num = dfa->nodes[bkref_node].opr.idx;
2683 /* For each sub expression */
2684 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2687 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2688 re_sub_match_last_t *sub_last;
2689 Idx sub_last_idx, sl_str, bkref_str_off;
2691 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2692 continue; /* It isn't related. */
2694 sl_str = sub_top->str_idx;
2695 bkref_str_off = bkref_str_idx;
2696 /* At first, check the last node of sub expressions we already
2698 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2700 regoff_t sl_str_diff;
2701 sub_last = sub_top->lasts[sub_last_idx];
2702 sl_str_diff = sub_last->str_idx - sl_str;
2703 /* The matched string by the sub expression match with the substring
2704 at the back reference? */
2705 if (sl_str_diff > 0)
2707 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2709 /* Not enough chars for a successful match. */
2710 if (bkref_str_off + sl_str_diff > mctx->input.len)
2713 err = clean_state_log_if_needed (mctx,
2716 if (BE (err != REG_NOERROR, 0))
2718 buf = (const char *) re_string_get_buffer (&mctx->input);
2720 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2721 break; /* We don't need to search this sub expression any more. */
2723 bkref_str_off += sl_str_diff;
2724 sl_str += sl_str_diff;
2725 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2728 /* Reload buf, since the preceding call might have reallocated
2730 buf = (const char *) re_string_get_buffer (&mctx->input);
2732 if (err == REG_NOMATCH)
2734 if (BE (err != REG_NOERROR, 0))
2738 if (sub_last_idx < sub_top->nlasts)
2740 if (sub_last_idx > 0)
2742 /* Then, search for the other last nodes of the sub expression. */
2743 for (; sl_str <= bkref_str_idx; ++sl_str)
2746 regoff_t sl_str_off;
2747 const re_node_set *nodes;
2748 sl_str_off = sl_str - sub_top->str_idx;
2749 /* The matched string by the sub expression match with the substring
2750 at the back reference? */
2753 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2755 /* If we are at the end of the input, we cannot match. */
2756 if (bkref_str_off >= mctx->input.len)
2759 err = extend_buffers (mctx);
2760 if (BE (err != REG_NOERROR, 0))
2763 buf = (const char *) re_string_get_buffer (&mctx->input);
2765 if (buf [bkref_str_off++] != buf[sl_str - 1])
2766 break; /* We don't need to search this sub expression
2769 if (mctx->state_log[sl_str] == NULL)
2771 /* Does this state have a ')' of the sub expression? */
2772 nodes = &mctx->state_log[sl_str]->nodes;
2773 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2774 if (cls_node == REG_MISSING)
2776 if (sub_top->path == NULL)
2778 sub_top->path = re_calloc (state_array_t,
2779 sl_str - sub_top->str_idx + 1);
2780 if (sub_top->path == NULL)
2783 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2784 in the current context? */
2785 err = check_arrival (mctx, sub_top->path, sub_top->node,
2786 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2787 if (err == REG_NOMATCH)
2789 if (BE (err != REG_NOERROR, 0))
2791 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2792 if (BE (sub_last == NULL, 0))
2794 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2796 if (err == REG_NOMATCH)
2803 /* Helper functions for get_subexp(). */
2805 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2806 If it can arrive, register the sub expression expressed with SUB_TOP
2809 static reg_errcode_t
2811 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2812 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2816 /* Can the subexpression arrive the back reference? */
2817 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2818 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2819 if (err != REG_NOERROR)
2821 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2823 if (BE (err != REG_NOERROR, 0))
2825 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2826 return clean_state_log_if_needed (mctx, to_idx);
2829 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2830 Search '(' if FL_OPEN, or search ')' otherwise.
2831 TODO: This function isn't efficient...
2832 Because there might be more than one nodes whose types are
2833 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2839 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2840 Idx subexp_idx, int type)
2843 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2845 Idx cls_node = nodes->elems[cls_idx];
2846 const re_token_t *node = dfa->nodes + cls_node;
2847 if (node->type == type
2848 && node->opr.idx == subexp_idx)
2854 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2855 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2857 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2859 static reg_errcode_t
2861 check_arrival (re_match_context_t *mctx, state_array_t *path,
2862 Idx top_node, Idx top_str, Idx last_node, Idx last_str,
2865 re_dfa_t *const dfa = mctx->dfa;
2867 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2868 re_dfastate_t *cur_state = NULL;
2869 re_node_set *cur_nodes, next_nodes;
2870 re_dfastate_t **backup_state_log;
2871 unsigned int context;
2873 subexp_num = dfa->nodes[top_node].opr.idx;
2874 /* Extend the buffer if we need. */
2875 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2877 re_dfastate_t **new_array;
2878 Idx old_alloc = path->alloc;
2879 Idx new_alloc = old_alloc + last_str + mctx->max_mb_elem_len + 1;
2880 if (BE (new_alloc < old_alloc, 0))
2882 new_array = re_xrealloc (path->array, re_dfastate_t *, new_alloc);
2883 if (BE (new_array == NULL, 0))
2885 path->array = new_array;
2886 path->alloc = new_alloc;
2887 memset (new_array + old_alloc, '\0',
2888 sizeof (re_dfastate_t *) * (new_alloc - old_alloc));
2891 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2893 /* Temporary modify MCTX. */
2894 backup_state_log = mctx->state_log;
2895 backup_cur_idx = mctx->input.cur_idx;
2896 mctx->state_log = path->array;
2897 mctx->input.cur_idx = str_idx;
2899 /* Setup initial node set. */
2900 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2901 if (str_idx == top_str)
2903 err = re_node_set_init_1 (&next_nodes, top_node);
2904 if (BE (err != REG_NOERROR, 0))
2906 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2907 if (BE (err != REG_NOERROR, 0))
2909 re_node_set_free (&next_nodes);
2915 cur_state = mctx->state_log[str_idx];
2916 if (cur_state && cur_state->has_backref)
2918 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2919 if (BE ( err != REG_NOERROR, 0))
2923 re_node_set_init_empty (&next_nodes);
2925 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2927 if (next_nodes.nelem)
2929 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2931 if (BE ( err != REG_NOERROR, 0))
2933 re_node_set_free (&next_nodes);
2937 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2938 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2940 re_node_set_free (&next_nodes);
2943 mctx->state_log[str_idx] = cur_state;
2946 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2948 re_node_set_empty (&next_nodes);
2949 if (mctx->state_log[str_idx + 1])
2951 err = re_node_set_merge (&next_nodes,
2952 &mctx->state_log[str_idx + 1]->nodes);
2953 if (BE (err != REG_NOERROR, 0))
2955 re_node_set_free (&next_nodes);
2961 err = check_arrival_add_next_nodes (mctx, str_idx,
2962 &cur_state->non_eps_nodes, &next_nodes);
2963 if (BE (err != REG_NOERROR, 0))
2965 re_node_set_free (&next_nodes);
2970 if (next_nodes.nelem)
2972 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2973 if (BE (err != REG_NOERROR, 0))
2975 re_node_set_free (&next_nodes);
2978 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2980 if (BE ( err != REG_NOERROR, 0))
2982 re_node_set_free (&next_nodes);
2986 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2987 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2988 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2990 re_node_set_free (&next_nodes);
2993 mctx->state_log[str_idx] = cur_state;
2994 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2996 re_node_set_free (&next_nodes);
2997 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2998 : &mctx->state_log[last_str]->nodes);
2999 path->next_idx = str_idx;
3002 mctx->state_log = backup_state_log;
3003 mctx->input.cur_idx = backup_cur_idx;
3005 /* Then check the current node set has the node LAST_NODE. */
3006 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3012 /* Helper functions for check_arrival. */
3014 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3016 TODO: This function is similar to the functions transit_state*(),
3017 however this function has many additional works.
3018 Can't we unify them? */
3020 static reg_errcode_t
3022 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3023 re_node_set *cur_nodes,
3024 re_node_set *next_nodes)
3026 re_dfa_t *const dfa = mctx->dfa;
3030 re_node_set union_set;
3031 re_node_set_init_empty (&union_set);
3032 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3035 Idx cur_node = cur_nodes->elems[cur_idx];
3037 re_token_type_t type = dfa->nodes[cur_node].type;
3038 assert (!IS_EPSILON_NODE (type));
3040 #ifdef RE_ENABLE_I18N
3041 /* If the node may accept `multi byte'. */
3042 if (dfa->nodes[cur_node].accept_mb)
3044 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3048 re_dfastate_t *dest_state;
3049 Idx next_node = dfa->nexts[cur_node];
3050 Idx next_idx = str_idx + naccepted;
3051 dest_state = mctx->state_log[next_idx];
3052 re_node_set_empty (&union_set);
3055 err = re_node_set_merge (&union_set, &dest_state->nodes);
3056 if (BE (err != REG_NOERROR, 0))
3058 re_node_set_free (&union_set);
3062 ok = re_node_set_insert (&union_set, next_node);
3065 re_node_set_free (&union_set);
3068 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3070 if (BE (mctx->state_log[next_idx] == NULL
3071 && err != REG_NOERROR, 0))
3073 re_node_set_free (&union_set);
3078 #endif /* RE_ENABLE_I18N */
3080 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3082 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3085 re_node_set_free (&union_set);
3090 re_node_set_free (&union_set);
3094 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3095 CUR_NODES, however exclude the nodes which are:
3096 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3097 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3100 static reg_errcode_t
3102 check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3103 Idx ex_subexp, int type)
3106 Idx idx, outside_node;
3107 re_node_set new_nodes;
3109 assert (cur_nodes->nelem);
3111 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3112 if (BE (err != REG_NOERROR, 0))
3114 /* Create a new node set NEW_NODES with the nodes which are epsilon
3115 closures of the node in CUR_NODES. */
3117 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3119 Idx cur_node = cur_nodes->elems[idx];
3120 re_node_set *eclosure = dfa->eclosures + cur_node;
3121 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3122 if (outside_node == REG_MISSING)
3124 /* There are no problematic nodes, just merge them. */
3125 err = re_node_set_merge (&new_nodes, eclosure);
3126 if (BE (err != REG_NOERROR, 0))
3128 re_node_set_free (&new_nodes);
3134 /* There are problematic nodes, re-calculate incrementally. */
3135 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3137 if (BE (err != REG_NOERROR, 0))
3139 re_node_set_free (&new_nodes);
3144 re_node_set_free (cur_nodes);
3145 *cur_nodes = new_nodes;
3149 /* Helper function for check_arrival_expand_ecl.
3150 Check incrementally the epsilon closure of TARGET, and if it isn't
3151 problematic append it to DST_NODES. */
3153 static reg_errcode_t
3155 check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3156 Idx target, Idx ex_subexp, int type)
3159 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3163 if (dfa->nodes[cur_node].type == type
3164 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3166 if (type == OP_CLOSE_SUBEXP)
3168 ok = re_node_set_insert (dst_nodes, cur_node);
3174 ok = re_node_set_insert (dst_nodes, cur_node);
3177 if (dfa->edests[cur_node].nelem == 0)
3179 if (dfa->edests[cur_node].nelem == 2)
3182 check_arrival_expand_ecl_sub (dfa, dst_nodes,
3183 dfa->edests[cur_node].elems[1],
3185 if (BE (ret != REG_NOERROR, 0))
3188 cur_node = dfa->edests[cur_node].elems[0];
3194 /* For all the back references in the current state, calculate the
3195 destination of the back references by the appropriate entry
3196 in MCTX->BKREF_ENTS. */
3198 static reg_errcode_t
3200 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3201 Idx cur_str, Idx subexp_num, int type)
3203 re_dfa_t *const dfa = mctx->dfa;
3205 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3206 struct re_backref_cache_entry *ent;
3208 if (cache_idx_start == REG_MISSING)
3212 ent = mctx->bkref_ents + cache_idx_start;
3215 Idx to_idx, next_node;
3217 /* Is this entry ENT is appropriate? */
3218 if (!re_node_set_contains (cur_nodes, ent->node))
3221 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3222 /* Calculate the destination of the back reference, and append it
3223 to MCTX->STATE_LOG. */
3224 if (to_idx == cur_str)
3226 /* The backreference did epsilon transit, we must re-check all the
3227 node in the current state. */
3228 re_node_set new_dests;
3229 reg_errcode_t err2, err3;
3230 next_node = dfa->edests[ent->node].elems[0];
3231 if (re_node_set_contains (cur_nodes, next_node))
3233 err = re_node_set_init_1 (&new_dests, next_node);
3234 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3235 err3 = re_node_set_merge (cur_nodes, &new_dests);
3236 re_node_set_free (&new_dests);
3237 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3238 || err3 != REG_NOERROR, 0))
3240 err = (err != REG_NOERROR ? err
3241 : (err2 != REG_NOERROR ? err2 : err3));
3244 /* TODO: It is still inefficient... */
3249 re_node_set union_set;
3250 next_node = dfa->nexts[ent->node];
3251 if (mctx->state_log[to_idx])
3254 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3257 err = re_node_set_init_copy (&union_set,
3258 &mctx->state_log[to_idx]->nodes);
3259 ok = re_node_set_insert (&union_set, next_node);
3260 if (BE (err != REG_NOERROR || ! ok, 0))
3262 re_node_set_free (&union_set);
3263 err = err != REG_NOERROR ? err : REG_ESPACE;
3269 err = re_node_set_init_1 (&union_set, next_node);
3270 if (BE (err != REG_NOERROR, 0))
3273 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3274 re_node_set_free (&union_set);
3275 if (BE (mctx->state_log[to_idx] == NULL
3276 && err != REG_NOERROR, 0))
3280 while (ent++->more);
3284 /* Build transition table for the state.
3285 Return true if successful. */
3289 build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3294 bool need_word_trtable = false;
3295 unsigned int elem, mask;
3296 bool dests_node_malloced = false, dest_states_malloced = false;
3297 Idx ndests; /* Number of the destination states from `state'. */
3298 re_dfastate_t **trtable;
3299 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3300 re_node_set follows, *dests_node;
3304 /* We build DFA states which corresponds to the destination nodes
3305 from `state'. `dests_node[i]' represents the nodes which i-th
3306 destination state contains, and `dests_ch[i]' represents the
3307 characters which i-th destination state accepts. */
3308 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3309 dests_node = (re_node_set *)
3310 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3313 dests_node = (re_node_set *)
3314 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3315 if (BE (dests_node == NULL, 0))
3317 dests_node_malloced = true;
3319 dests_ch = (bitset *) (dests_node + SBC_MAX);
3321 /* Initialize transiton table. */
3322 state->word_trtable = state->trtable = NULL;
3324 /* At first, group all nodes belonging to `state' into several
3326 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3327 if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3329 if (dests_node_malloced)
3333 state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3339 err = re_node_set_alloc (&follows, ndests + 1);
3340 if (BE (err != REG_NOERROR, 0))
3343 /* Avoid arithmetic overflow in size calculation. */
3344 if (BE (((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX)
3345 / (3 * sizeof (re_dfastate_t *)))
3349 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3350 + ndests * 3 * sizeof (re_dfastate_t *)))
3351 dest_states = (re_dfastate_t **)
3352 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3355 dest_states = (re_dfastate_t **)
3356 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3357 if (BE (dest_states == NULL, 0))
3360 if (dest_states_malloced)
3362 re_node_set_free (&follows);
3363 for (i = 0; i < ndests; ++i)
3364 re_node_set_free (dests_node + i);
3365 if (dests_node_malloced)
3369 dest_states_malloced = true;
3371 dest_states_word = dest_states + ndests;
3372 dest_states_nl = dest_states_word + ndests;
3373 bitset_empty (acceptable);
3375 /* Then build the states for all destinations. */
3376 for (i = 0; i < ndests; ++i)
3379 re_node_set_empty (&follows);
3380 /* Merge the follows of this destination states. */
3381 for (j = 0; j < dests_node[i].nelem; ++j)
3383 next_node = dfa->nexts[dests_node[i].elems[j]];
3384 if (next_node != REG_MISSING)
3386 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3387 if (BE (err != REG_NOERROR, 0))
3391 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3392 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3394 /* If the new state has context constraint,
3395 build appropriate states for these contexts. */
3396 if (dest_states[i]->has_constraint)
3398 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3400 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3403 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3404 need_word_trtable = true;
3406 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3408 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3413 dest_states_word[i] = dest_states[i];
3414 dest_states_nl[i] = dest_states[i];
3416 bitset_merge (acceptable, dests_ch[i]);
3419 if (!BE (need_word_trtable, 0))
3421 /* We don't care about whether the following character is a word
3422 character, or we are in a single-byte character set so we can
3423 discern by looking at the character code: allocate a
3424 256-entry transition table. */
3425 trtable = state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3426 if (BE (trtable == NULL, 0))
3429 /* For all characters ch...: */
3430 for (i = 0; i < BITSET_UINTS; ++i)
3431 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3433 mask <<= 1, elem >>= 1, ++ch)
3434 if (BE (elem & 1, 0))
3436 /* There must be exactly one destination which accepts
3437 character ch. See group_nodes_into_DFAstates. */
3438 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3441 /* j-th destination accepts the word character ch. */
3442 if (dfa->word_char[i] & mask)
3443 trtable[ch] = dest_states_word[j];
3445 trtable[ch] = dest_states[j];
3450 /* We care about whether the following character is a word
3451 character, and we are in a multi-byte character set: discern
3452 by looking at the character code: build two 256-entry
3453 transition tables, one starting at trtable[0] and one
3454 starting at trtable[SBC_MAX]. */
3455 trtable = state->word_trtable = re_calloc (re_dfastate_t *, 2 * SBC_MAX);
3456 if (BE (trtable == NULL, 0))
3459 /* For all characters ch...: */
3460 for (i = 0; i < BITSET_UINTS; ++i)
3461 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3463 mask <<= 1, elem >>= 1, ++ch)
3464 if (BE (elem & 1, 0))
3466 /* There must be exactly one destination which accepts
3467 character ch. See group_nodes_into_DFAstates. */
3468 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3471 /* j-th destination accepts the word character ch. */
3472 trtable[ch] = dest_states[j];
3473 trtable[ch + SBC_MAX] = dest_states_word[j];
3478 if (bitset_contain (acceptable, NEWLINE_CHAR))
3480 /* The current state accepts newline character. */
3481 for (j = 0; j < ndests; ++j)
3482 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3484 /* k-th destination accepts newline character. */
3485 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3486 if (need_word_trtable)
3487 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3488 /* There must be only one destination which accepts
3489 newline. See group_nodes_into_DFAstates. */
3494 if (dest_states_malloced)
3497 re_node_set_free (&follows);
3498 for (i = 0; i < ndests; ++i)
3499 re_node_set_free (dests_node + i);
3501 if (dests_node_malloced)
3507 /* Group all nodes belonging to STATE into several destinations.
3508 Then for all destinations, set the nodes belonging to the destination
3509 to DESTS_NODE[i] and set the characters accepted by the destination
3510 to DEST_CH[i]. This function return the number of destinations. */
3514 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3515 re_node_set *dests_node, bitset *dests_ch)
3520 Idx ndests; /* Number of the destinations from `state'. */
3521 bitset accepts; /* Characters a node can accept. */
3522 const re_node_set *cur_nodes = &state->nodes;
3523 bitset_empty (accepts);
3526 /* For all the nodes belonging to `state', */
3527 for (i = 0; i < cur_nodes->nelem; ++i)
3529 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3530 re_token_type_t type = node->type;
3531 unsigned int constraint = node->constraint;
3533 /* Enumerate all single byte character this node can accept. */
3534 if (type == CHARACTER)
3535 bitset_set (accepts, node->opr.c);
3536 else if (type == SIMPLE_BRACKET)
3538 bitset_merge (accepts, node->opr.sbcset);
3540 else if (type == OP_PERIOD)
3542 #ifdef RE_ENABLE_I18N
3543 if (dfa->mb_cur_max > 1)
3544 bitset_merge (accepts, dfa->sb_char);
3547 bitset_set_all (accepts);
3548 if (!(dfa->syntax & REG_DOT_NEWLINE))
3549 bitset_clear (accepts, '\n');
3550 if (dfa->syntax & REG_DOT_NOT_NULL)
3551 bitset_clear (accepts, '\0');
3553 #ifdef RE_ENABLE_I18N
3554 else if (type == OP_UTF8_PERIOD)
3556 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3557 if (!(dfa->syntax & REG_DOT_NEWLINE))
3558 bitset_clear (accepts, '\n');
3559 if (dfa->syntax & REG_DOT_NOT_NULL)
3560 bitset_clear (accepts, '\0');
3566 /* Check the `accepts' and sift the characters which are not
3567 match it the context. */
3570 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3572 unsigned int accepts_newline =
3573 bitset_contain (accepts, NEWLINE_CHAR);
3574 bitset_empty (accepts);
3575 if (accepts_newline)
3576 bitset_set (accepts, NEWLINE_CHAR);
3580 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3582 bitset_empty (accepts);
3586 if (constraint & NEXT_WORD_CONSTRAINT)
3588 unsigned int any_set = 0;
3589 if (type == CHARACTER && !node->word_char)
3591 bitset_empty (accepts);
3594 #ifdef RE_ENABLE_I18N
3595 if (dfa->mb_cur_max > 1)
3596 for (j = 0; j < BITSET_UINTS; ++j)
3597 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3600 for (j = 0; j < BITSET_UINTS; ++j)
3601 any_set |= (accepts[j] &= dfa->word_char[j]);
3605 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3607 unsigned int any_set = 0;
3608 if (type == CHARACTER && node->word_char)
3610 bitset_empty (accepts);
3613 #ifdef RE_ENABLE_I18N
3614 if (dfa->mb_cur_max > 1)
3615 for (j = 0; j < BITSET_UINTS; ++j)
3616 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3619 for (j = 0; j < BITSET_UINTS; ++j)
3620 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3626 /* Then divide `accepts' into DFA states, or create a new
3627 state. Above, we make sure that accepts is not empty. */
3628 for (j = 0; j < ndests; ++j)
3630 bitset intersec; /* Intersection sets, see below. */
3632 /* Flags, see below. */
3633 unsigned int has_intersec, not_subset, not_consumed;
3635 /* Optimization, skip if this state doesn't accept the character. */
3636 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3639 /* Enumerate the intersection set of this state and `accepts'. */
3641 for (k = 0; k < BITSET_UINTS; ++k)
3642 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3643 /* And skip if the intersection set is empty. */
3647 /* Then check if this state is a subset of `accepts'. */
3648 not_subset = not_consumed = 0;
3649 for (k = 0; k < BITSET_UINTS; ++k)
3651 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3652 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3655 /* If this state isn't a subset of `accepts', create a
3656 new group state, which has the `remains'. */
3659 bitset_copy (dests_ch[ndests], remains);
3660 bitset_copy (dests_ch[j], intersec);
3661 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3662 if (BE (err != REG_NOERROR, 0))
3667 /* Put the position in the current group. */
3668 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3672 /* If all characters are consumed, go to next node. */
3676 /* Some characters remain, create a new group. */
3679 bitset_copy (dests_ch[ndests], accepts);
3680 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3681 if (BE (err != REG_NOERROR, 0))
3684 bitset_empty (accepts);
3689 for (j = 0; j < ndests; ++j)
3690 re_node_set_free (dests_node + j);
3694 #ifdef RE_ENABLE_I18N
3695 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3696 Return the number of the bytes the node accepts.
3697 STR_IDX is the current index of the input string.
3699 This function handles the nodes which can accept one character, or
3700 one collating element like '.', '[a-z]', opposite to the other nodes
3701 can only accept one byte. */
3705 check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
3706 const re_string_t *input, Idx str_idx)
3708 const re_token_t *node = dfa->nodes + node_idx;
3709 int char_len, elem_len;
3712 if (BE (node->type == OP_UTF8_PERIOD, 0))
3714 unsigned char c = re_string_byte_at (input, str_idx), d;
3715 if (BE (c < 0xc2, 1))
3718 if (str_idx + 2 > input->len)
3721 d = re_string_byte_at (input, str_idx + 1);
3723 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3727 if (c == 0xe0 && d < 0xa0)
3733 if (c == 0xf0 && d < 0x90)
3739 if (c == 0xf8 && d < 0x88)
3745 if (c == 0xfc && d < 0x84)
3751 if (str_idx + char_len > input->len)
3754 for (i = 1; i < char_len; ++i)
3756 d = re_string_byte_at (input, str_idx + i);
3757 if (d < 0x80 || d > 0xbf)
3763 char_len = re_string_char_size_at (input, str_idx);
3764 if (node->type == OP_PERIOD)
3768 /* FIXME: I don't think this if is needed, as both '\n'
3769 and '\0' are char_len == 1. */
3770 /* '.' accepts any one character except the following two cases. */
3771 if ((!(dfa->syntax & REG_DOT_NEWLINE) &&
3772 re_string_byte_at (input, str_idx) == '\n') ||
3773 ((dfa->syntax & REG_DOT_NOT_NULL) &&
3774 re_string_byte_at (input, str_idx) == '\0'))
3779 elem_len = re_string_elem_size_at (input, str_idx);
3780 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3783 if (node->type == COMPLEX_BRACKET)
3785 const re_charset_t *cset = node->opr.mbcset;
3787 const unsigned char *pin
3788 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3793 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3794 ? re_string_wchar_at (input, str_idx) : 0);
3796 /* match with multibyte character? */
3797 for (i = 0; i < cset->nmbchars; ++i)
3798 if (wc == cset->mbchars[i])
3800 match_len = char_len;
3801 goto check_node_accept_bytes_match;
3803 /* match with character_class? */
3804 for (i = 0; i < cset->nchar_classes; ++i)
3806 wctype_t wt = cset->char_classes[i];
3807 if (__iswctype (wc, wt))
3809 match_len = char_len;
3810 goto check_node_accept_bytes_match;
3815 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3818 unsigned int in_collseq = 0;
3819 const int32_t *table, *indirect;
3820 const unsigned char *weights, *extra;
3821 const char *collseqwc;
3823 /* This #include defines a local function! */
3824 # include <locale/weight.h>
3826 /* match with collating_symbol? */
3827 if (cset->ncoll_syms)
3828 extra = (const unsigned char *)
3829 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3830 for (i = 0; i < cset->ncoll_syms; ++i)
3832 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3833 /* Compare the length of input collating element and
3834 the length of current collating element. */
3835 if (*coll_sym != elem_len)
3837 /* Compare each bytes. */
3838 for (j = 0; j < *coll_sym; j++)
3839 if (pin[j] != coll_sym[1 + j])
3843 /* Match if every bytes is equal. */
3845 goto check_node_accept_bytes_match;
3851 if (elem_len <= char_len)
3853 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3854 in_collseq = __collseq_table_lookup (collseqwc, wc);
3857 in_collseq = find_collation_sequence_value (pin, elem_len);
3859 /* match with range expression? */
3860 for (i = 0; i < cset->nranges; ++i)
3861 if (cset->range_starts[i] <= in_collseq
3862 && in_collseq <= cset->range_ends[i])
3864 match_len = elem_len;
3865 goto check_node_accept_bytes_match;
3868 /* match with equivalence_class? */
3869 if (cset->nequiv_classes)
3871 const unsigned char *cp = pin;
3872 table = (const int32_t *)
3873 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3874 weights = (const unsigned char *)
3875 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3876 extra = (const unsigned char *)
3877 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3878 indirect = (const int32_t *)
3879 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3880 idx = findidx (&cp);
3882 for (i = 0; i < cset->nequiv_classes; ++i)
3884 int32_t equiv_class_idx = cset->equiv_classes[i];
3885 size_t weight_len = weights[idx];
3886 if (weight_len == weights[equiv_class_idx])
3889 while (cnt <= weight_len
3890 && (weights[equiv_class_idx + 1 + cnt]
3891 == weights[idx + 1 + cnt]))
3893 if (cnt > weight_len)
3895 match_len = elem_len;
3896 goto check_node_accept_bytes_match;
3905 /* match with range expression? */
3907 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3909 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3912 for (i = 0; i < cset->nranges; ++i)
3914 cmp_buf[0] = cset->range_starts[i];
3915 cmp_buf[4] = cset->range_ends[i];
3916 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3917 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3919 match_len = char_len;
3920 goto check_node_accept_bytes_match;
3924 check_node_accept_bytes_match:
3925 if (!cset->non_match)
3932 return (elem_len > char_len) ? elem_len : char_len;
3940 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3942 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3947 /* No valid character. Match it as a single byte character. */
3948 const unsigned char *collseq = (const unsigned char *)
3949 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3950 return collseq[mbs[0]];
3957 const unsigned char *extra = (const unsigned char *)
3958 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3959 int32_t extrasize = (const unsigned char *)
3960 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3962 for (idx = 0; idx < extrasize;)
3966 int32_t elem_mbs_len;
3967 /* Skip the name of collating element name. */
3968 idx = idx + extra[idx] + 1;
3969 elem_mbs_len = extra[idx++];
3970 if (mbs_len == elem_mbs_len)
3972 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3973 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3975 if (mbs_cnt == elem_mbs_len)
3976 /* Found the entry. */
3979 /* Skip the byte sequence of the collating element. */
3980 idx += elem_mbs_len;
3981 /* Adjust for the alignment. */
3982 idx = (idx + 3) & ~3;
3983 /* Skip the collation sequence value. */
3984 idx += sizeof (uint32_t);
3985 /* Skip the wide char sequence of the collating element. */
3986 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3987 /* If we found the entry, return the sequence value. */
3989 return *(uint32_t *) (extra + idx);
3990 /* Skip the collation sequence value. */
3991 idx += sizeof (uint32_t);
3997 #endif /* RE_ENABLE_I18N */
3999 /* Check whether the node accepts the byte which is IDX-th
4000 byte of the INPUT. */
4004 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4008 ch = re_string_byte_at (&mctx->input, idx);
4012 if (node->opr.c != ch)
4016 case SIMPLE_BRACKET:
4017 if (!bitset_contain (node->opr.sbcset, ch))
4021 #ifdef RE_ENABLE_I18N
4022 case OP_UTF8_PERIOD:
4028 if ((ch == '\n' && !(mctx->dfa->syntax & REG_DOT_NEWLINE))
4029 || (ch == '\0' && (mctx->dfa->syntax & REG_DOT_NOT_NULL)))
4037 if (node->constraint)
4039 /* The node has constraints. Check whether the current context
4040 satisfies the constraints. */
4041 unsigned int context = re_string_context_at (&mctx->input, idx,
4043 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4050 /* Extend the buffers, if the buffers have run out. */
4052 static reg_errcode_t
4054 extend_buffers (re_match_context_t *mctx)
4057 re_string_t *pstr = &mctx->input;
4059 /* Double the lengthes of the buffers. */
4060 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4061 if (BE (ret != REG_NOERROR, 0))
4064 if (mctx->state_log != NULL)
4066 /* And double the length of state_log. */
4067 /* XXX We have no indication of the size of this buffer. If this
4068 allocation fail we have no indication that the state_log array
4069 does not have the right size. */
4070 re_dfastate_t **new_array = re_xrealloc (mctx->state_log, re_dfastate_t *,
4071 pstr->bufs_len + 1);
4072 if (BE (new_array == NULL, 0))
4074 mctx->state_log = new_array;
4077 /* Then reconstruct the buffers. */
4080 #ifdef RE_ENABLE_I18N
4081 if (pstr->mb_cur_max > 1)
4083 ret = build_wcs_upper_buffer (pstr);
4084 if (BE (ret != REG_NOERROR, 0))
4088 #endif /* RE_ENABLE_I18N */
4089 build_upper_buffer (pstr);
4093 #ifdef RE_ENABLE_I18N
4094 if (pstr->mb_cur_max > 1)
4095 build_wcs_buffer (pstr);
4097 #endif /* RE_ENABLE_I18N */
4099 if (pstr->trans != NULL)
4100 re_string_translate_buffer (pstr);
4107 /* Functions for matching context. */
4109 /* Initialize MCTX. */
4111 static reg_errcode_t
4113 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4115 mctx->eflags = eflags;
4116 mctx->match_last = REG_MISSING;
4119 mctx->bkref_ents = re_xmalloc (struct re_backref_cache_entry, n);
4120 mctx->sub_tops = re_xmalloc (re_sub_match_top_t *, n);
4121 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4124 /* Already zero-ed by the caller.
4126 mctx->bkref_ents = NULL;
4127 mctx->nbkref_ents = 0;
4128 mctx->nsub_tops = 0; */
4129 mctx->abkref_ents = n;
4130 mctx->max_mb_elem_len = 1;
4131 mctx->asub_tops = n;
4135 /* Clean the entries which depend on the current input in MCTX.
4136 This function must be invoked when the matcher changes the start index
4137 of the input, or changes the input string. */
4141 match_ctx_clean (re_match_context_t *mctx)
4144 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4147 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4148 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4150 re_sub_match_last_t *last = top->lasts[sl_idx];
4151 re_free (last->path.array);
4154 re_free (top->lasts);
4157 re_free (top->path->array);
4158 re_free (top->path);
4163 mctx->nsub_tops = 0;
4164 mctx->nbkref_ents = 0;
4167 /* Free all the memory associated with MCTX. */
4171 match_ctx_free (re_match_context_t *mctx)
4173 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4174 match_ctx_clean (mctx);
4175 re_free (mctx->sub_tops);
4176 re_free (mctx->bkref_ents);
4179 /* Add a new backreference entry to MCTX.
4180 Note that we assume that caller never call this function with duplicate
4181 entry, and call with STR_IDX which isn't smaller than any existing entry.
4184 static reg_errcode_t
4186 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx,
4189 if (mctx->nbkref_ents >= mctx->abkref_ents)
4191 struct re_backref_cache_entry* new_entry;
4192 new_entry = re_x2realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4193 &mctx->abkref_ents);
4194 if (BE (new_entry == NULL, 0))
4196 re_free (mctx->bkref_ents);
4199 mctx->bkref_ents = new_entry;
4200 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4201 (sizeof (struct re_backref_cache_entry)
4202 * (mctx->abkref_ents - mctx->nbkref_ents)));
4204 if (mctx->nbkref_ents > 0
4205 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4206 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4208 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4209 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4210 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4211 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4213 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4214 If bit N is clear, means that this entry won't epsilon-transition to
4215 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4216 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4219 A backreference does not epsilon-transition unless it is empty, so set
4220 to all zeros if FROM != TO. */
4221 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4222 = (from == to ? -1 : 0);
4224 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4225 if (mctx->max_mb_elem_len < to - from)
4226 mctx->max_mb_elem_len = to - from;
4230 /* Return the first entry with the same str_idx, or REG_MISSING if none is
4231 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4235 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4237 Idx left, right, mid, last;
4238 last = right = mctx->nbkref_ents;
4239 for (left = 0; left < right;)
4241 mid = (left + right) / 2;
4242 if (mctx->bkref_ents[mid].str_idx < str_idx)
4247 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4253 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4256 static reg_errcode_t
4258 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4261 assert (mctx->sub_tops != NULL);
4262 assert (mctx->asub_tops > 0);
4264 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4266 Idx new_asub_tops = mctx->asub_tops;
4267 re_sub_match_top_t **new_array = re_x2realloc (mctx->sub_tops,
4268 re_sub_match_top_t *,
4270 if (BE (new_array == NULL, 0))
4272 mctx->sub_tops = new_array;
4273 mctx->asub_tops = new_asub_tops;
4275 mctx->sub_tops[mctx->nsub_tops] = re_calloc (re_sub_match_top_t, 1);
4276 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4278 mctx->sub_tops[mctx->nsub_tops]->node = node;
4279 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4283 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4284 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4286 static re_sub_match_last_t *
4288 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4290 re_sub_match_last_t *new_entry;
4291 if (BE (subtop->nlasts == subtop->alasts, 0))
4293 Idx new_alasts = subtop->alasts;
4294 re_sub_match_last_t **new_array = re_x2realloc (subtop->lasts,
4295 re_sub_match_last_t *,
4297 if (BE (new_array == NULL, 0))
4299 subtop->lasts = new_array;
4300 subtop->alasts = new_alasts;
4302 new_entry = re_calloc (re_sub_match_last_t, 1);
4303 if (BE (new_entry != NULL, 1))
4305 subtop->lasts[subtop->nlasts] = new_entry;
4306 new_entry->node = node;
4307 new_entry->str_idx = str_idx;
4315 sift_ctx_init (re_sift_context_t *sctx,
4316 re_dfastate_t **sifted_sts,
4317 re_dfastate_t **limited_sts,
4318 Idx last_node, Idx last_str_idx)
4320 sctx->sifted_states = sifted_sts;
4321 sctx->limited_states = limited_sts;
4322 sctx->last_node = last_node;
4323 sctx->last_str_idx = last_str_idx;
4324 re_node_set_init_empty (&sctx->limits);