lib/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      int n) internal_function;
  22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  23 static void match_ctx_free (re_match_context_t *cache) internal_function;
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  25                                           int str_idx, int from, int to)
  26      internal_function;
  27 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  28      internal_function;
  29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  30                                            int str_idx) internal_function;
  31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  32                                                    int node, int str_idx)
  33      internal_function;
  34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  35                            re_dfastate_t **limited_sts, int last_node,
  36                            int last_str_idx)
  37      internal_function;
  38 static reg_errcode_t re_search_internal (const regex_t *preg,
  39                                          const char *string, int length,
  40                                          int start, int range, int stop,
  41                                          size_t nmatch, regmatch_t pmatch[],
  42                                          int eflags) internal_function;
  43 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  44                              const char *string1, int length1,
  45                              const char *string2, int length2,
  46                              int start, int range, struct re_registers *regs,
  47                              int stop, int ret_len) internal_function;
  48 static int re_search_stub (struct re_pattern_buffer *bufp,
  49                            const char *string, int length, int start,
  50                            int range, int stop, struct re_registers *regs,
  51                            int ret_len) internal_function;
  52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  53                               int nregs, int regs_allocated) internal_function;
  54 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  55      internal_function;
  56 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  57                            int *p_match_first)
  58      internal_function;
  59 static int check_halt_state_context (const re_match_context_t *mctx,
  60                                      const re_dfastate_t *state, int idx)
  61      internal_function;
  62 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  63                          regmatch_t *prev_idx_match, int cur_node,
  64                          int cur_idx, int nmatch) internal_function;
  65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                       int str_idx, int dest_node, int nregs,
  67                                       regmatch_t *regs,
  68                                       re_node_set *eps_via_nodes) internal_function;
  69 static reg_errcode_t set_regs (const regex_t *preg,
  70                                const re_match_context_t *mctx,
  71                                size_t nmatch, regmatch_t *pmatch,
  72                                int fl_backtrack) internal_function;
  73 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  74
  75 #ifdef RE_ENABLE_I18N
  76 static int sift_states_iter_mb (const re_match_context_t *mctx,
  77                                 re_sift_context_t *sctx,
  78                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  79 #endif /* RE_ENABLE_I18N */
  80 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  81                                            re_sift_context_t *sctx) internal_function;
  82 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  83                                           re_sift_context_t *sctx, int str_idx,
  84                                           re_node_set *cur_dest) internal_function;
  85 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  86                                               re_sift_context_t *sctx,
  87                                               int str_idx,
  88                                               re_node_set *dest_nodes) internal_function;
  89 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
  90                                             re_node_set *dest_nodes,
  91                                             const re_node_set *candidates) internal_function;
  92 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
  93                              int dst_node, int dst_idx, int src_node,
  94                              int src_idx) internal_function;
  95 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
  96                                         int boundaries, int subexp_idx,
  97                                         int from_node, int bkref_idx) internal_function;
  98 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
  99                                       int limit, int subexp_idx,
 100                                       int node, int str_idx,
 101                                       int bkref_idx) internal_function;
 102 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 103                                           re_node_set *dest_nodes,
 104                                           const re_node_set *candidates,
 105                                           re_node_set *limits,
 106                                           struct re_backref_cache_entry *bkref_ents,
 107                                           int str_idx) internal_function;
 108 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 109                                         re_sift_context_t *sctx,
 110                                         int str_idx, const re_node_set *candidates) internal_function;
 111 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 112                                         re_dfastate_t **src, int num) internal_function;
 113 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 114                                          re_match_context_t *mctx) internal_function;
 115 static re_dfastate_t *transit_state (reg_errcode_t *err,
 116                                      re_match_context_t *mctx,
 117                                      re_dfastate_t *state) internal_function;
 118 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 119                                             re_match_context_t *mctx,
 120                                             re_dfastate_t *next_state) internal_function;
 121 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 122                                                 re_node_set *cur_nodes,
 123                                                 int str_idx) internal_function;
 124 #if 0
 125 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 126                                         re_match_context_t *mctx,
 127                                         re_dfastate_t *pstate) internal_function;
 128 #endif
 129 #ifdef RE_ENABLE_I18N
 130 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 131                                        re_dfastate_t *pstate) internal_function;
 132 #endif /* RE_ENABLE_I18N */
 133 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 134                                           const re_node_set *nodes) internal_function;
 135 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 136                                  int bkref_node, int bkref_str_idx) internal_function;
 137 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 138                                      const re_sub_match_top_t *sub_top,
 139                                      re_sub_match_last_t *sub_last,
 140                                      int bkref_node, int bkref_str) internal_function;
 141 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 142                              int subexp_idx, int type) internal_function;
 143 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 144                                     state_array_t *path, int top_node,
 145                                     int top_str, int last_node, int last_str,
 146                                     int type) internal_function;
 147 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 148                                                    int str_idx,
 149                                                    re_node_set *cur_nodes,
 150                                                    re_node_set *next_nodes) internal_function;
 151 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 152                                                re_node_set *cur_nodes,
 153                                                int ex_subexp, int type) internal_function;
 154 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 155                                                    re_node_set *dst_nodes,
 156                                                    int target, int ex_subexp,
 157                                                    int type) internal_function;
 158 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 159                                          re_node_set *cur_nodes, int cur_str,
 160                                          int subexp_num, int type) internal_function;
 161 static int build_trtable (re_dfa_t *dfa,
 162                           re_dfastate_t *state) internal_function;
 163 #ifdef RE_ENABLE_I18N
 164 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 165                                     const re_string_t *input, int idx) internal_function;
 166 # ifdef _LIBC
 167 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 168                                                    size_t name_len) internal_function;
 169 # endif /* _LIBC */
 170 #endif /* RE_ENABLE_I18N */
 171 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 172                                        const re_dfastate_t *state,
 173                                        re_node_set *states_node,
 174                                        bitset *states_ch) internal_function;
 175 static int check_node_accept (const re_match_context_t *mctx,
 176                               const re_token_t *node, int idx) internal_function;
 177 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 178 \f
 179 /* Entry point for POSIX code.  */
 180
 181 /* regexec searches for a given pattern, specified by PREG, in the
 182    string STRING.
 183
 184    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 185    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 186    least NMATCH elements, and we set them to the offsets of the
 187    corresponding matched substrings.
 188
 189    EFLAGS specifies `execution flags' which affect matching: if
 190    REG_NOTBOL is set, then ^ does not match at the beginning of the
 191    string; if REG_NOTEOL is set, then $ does not match at the end.
 192
 193    We return 0 if we find a match and REG_NOMATCH if not.  */
 194
 195 int
 196 regexec (const regex_t *__restrict preg, const char *__restrict string,
 197          size_t nmatch, regmatch_t pmatch[], int eflags)
 198 {
 199   reg_errcode_t err;
 200   int start, length;
 201 #ifdef _LIBC
 202   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 203 #endif
 204
 205   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 206     return REG_BADPAT;
 207
 208   if (eflags & REG_STARTEND)
 209     {
 210       start = pmatch[0].rm_so;
 211       length = pmatch[0].rm_eo;
 212     }
 213   else
 214     {
 215       start = 0;
 216       length = strlen (string);
 217     }
 218
 219   __libc_lock_lock (dfa->lock);
 220   if (preg->re_no_sub)
 221     err = re_search_internal (preg, string, length, start, length - start,
 222                               length, 0, NULL, eflags);
 223   else
 224     err = re_search_internal (preg, string, length, start, length - start,
 225                               length, nmatch, pmatch, eflags);
 226   __libc_lock_unlock (dfa->lock);
 227   return err != REG_NOERROR;
 228 }
 229
 230 #ifdef _LIBC
 231 # include <shlib-compat.h>
 232 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 233
 234 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 235 __typeof__ (__regexec) __compat_regexec;
 236
 237 int
 238 attribute_compat_text_section
 239 __compat_regexec (const regex_t *__restrict preg,
 240                   const char *__restrict string, size_t nmatch,
 241                   regmatch_t pmatch[], int eflags)
 242 {
 243   return regexec (preg, string, nmatch, pmatch,
 244                   eflags & (REG_NOTBOL | REG_NOTEOL));
 245 }
 246 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 247 # endif
 248 #endif
 249
 250 /* Entry points for GNU code.  */
 251
 252 /* re_match, re_search, re_match_2, re_search_2
 253
 254    The former two functions operate on STRING with length LENGTH,
 255    while the later two operate on concatenation of STRING1 and STRING2
 256    with lengths LENGTH1 and LENGTH2, respectively.
 257
 258    re_match() matches the compiled pattern in BUFP against the string,
 259    starting at index START.
 260
 261    re_search() first tries matching at index START, then it tries to match
 262    starting from index START + 1, and so on.  The last start position tried
 263    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 264    way as re_match().)
 265
 266    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 267    the first STOP characters of the concatenation of the strings should be
 268    concerned.
 269
 270    If REGS is not NULL, and BUFP->re_no_sub is not set, the offsets of the match
 271    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 272    computed relative to the concatenation, not relative to the individual
 273    strings.)
 274
 275    On success, re_match* functions return the length of the match, re_search*
 276    return the position of the start of the match.  Return value -1 means no
 277    match was found and -2 indicates an internal error.  */
 278
 279 int
 280 re_match (struct re_pattern_buffer *bufp, const char *string,
 281           int length, int start, struct re_registers *regs)
 282 {
 283   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 284 }
 285 #ifdef _LIBC
 286 weak_alias (__re_match, re_match)
 287 #endif
 288
 289 int
 290 re_search (struct re_pattern_buffer *bufp, const char *string,
 291            int length, int start, int range, struct re_registers *regs)
 292 {
 293   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 294 }
 295 #ifdef _LIBC
 296 weak_alias (__re_search, re_search)
 297 #endif
 298
 299 int
 300 re_match_2 (struct re_pattern_buffer *bufp,
 301             const char *string1, int length1,
 302             const char *string2, int length2,
 303             int start, struct re_registers *regs, int stop)
 304 {
 305   return re_search_2_stub (bufp, string1, length1, string2, length2,
 306                            start, 0, regs, stop, 1);
 307 }
 308 #ifdef _LIBC
 309 weak_alias (__re_match_2, re_match_2)
 310 #endif
 311
 312 int
 313 re_search_2 (struct re_pattern_buffer *bufp,
 314              const char *string1, int length1,
 315              const char *string2, int length2,
 316              int start, int range, struct re_registers *regs, int stop)
 317 {
 318   return re_search_2_stub (bufp, string1, length1, string2, length2,
 319                            start, range, regs, stop, 0);
 320 }
 321 #ifdef _LIBC
 322 weak_alias (__re_search_2, re_search_2)
 323 #endif
 324
 325 static int
 326 internal_function
 327 re_search_2_stub (struct re_pattern_buffer *bufp,
 328                   const char *string1, int length1,
 329                   const char *string2, int length2,
 330                   int start, int range, struct re_registers *regs, int stop,
 331                   int ret_len)
 332 {
 333   const char *str;
 334   int rval;
 335   int len = length1 + length2;
 336   int free_str = 0;
 337
 338   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 339     return -2;
 340
 341   /* Concatenate the strings.  */
 342   if (length2 > 0)
 343     if (length1 > 0)
 344       {
 345         char *s = re_malloc (char, len);
 346
 347         if (BE (s == NULL, 0))
 348           return -2;
 349         memcpy (s, string1, length1);
 350         memcpy (s + length1, string2, length2);
 351         str = s;
 352         free_str = 1;
 353       }
 354     else
 355       str = string2;
 356   else
 357     str = string1;
 358
 359   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 360                          ret_len);
 361   if (free_str)
 362     re_free ((char *) str);
 363   return rval;
 364 }
 365
 366 /* The parameters have the same meaning as those of re_search.
 367    Additional parameters:
 368    If RET_LEN is nonzero the length of the match is returned (re_match style);
 369    otherwise the position of the match is returned.  */
 370
 371 static int
 372 internal_function
 373 re_search_stub (struct re_pattern_buffer *bufp,
 374                 const char *string, int length,
 375                 int start, int range, int stop, struct re_registers *regs,
 376                 int ret_len)
 377 {
 378   reg_errcode_t result;
 379   regmatch_t *pmatch;
 380   int nregs, rval;
 381   int eflags = 0;
 382 #ifdef _LIBC
 383   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 384 #endif
 385
 386   /* Check for out-of-range.  */
 387   if (BE (start < 0 || start > length, 0))
 388     return -1;
 389   if (BE (start + range > length, 0))
 390     range = length - start;
 391   else if (BE (start + range < 0, 0))
 392     range = -start;
 393
 394   __libc_lock_lock (dfa->lock);
 395
 396   eflags |= (bufp->re_not_bol) ? REG_NOTBOL : 0;
 397   eflags |= (bufp->re_not_eol) ? REG_NOTEOL : 0;
 398
 399   /* Compile fastmap if we haven't yet.  */
 400   if (range > 0 && bufp->re_fastmap != NULL && !bufp->re_fastmap_accurate)
 401     re_compile_fastmap (bufp);
 402
 403   if (BE (bufp->re_no_sub, 0))
 404     regs = NULL;
 405
 406   /* We need at least 1 register.  */
 407   if (regs == NULL)
 408     nregs = 1;
 409   else if (BE (bufp->re_regs_allocated == REG_FIXED
 410                && regs->rm_num_regs < bufp->re_nsub + 1, 0))
 411     {
 412       nregs = regs->rm_num_regs;
 413       if (BE (nregs < 1, 0))
 414         {
 415           /* Nothing can be copied to regs.  */
 416           regs = NULL;
 417           nregs = 1;
 418         }
 419     }
 420   else
 421     nregs = bufp->re_nsub + 1;
 422   pmatch = re_malloc (regmatch_t, nregs);
 423   if (BE (pmatch == NULL, 0))
 424     {
 425       rval = -2;
 426       goto out;
 427     }
 428
 429   result = re_search_internal (bufp, string, length, start, range, stop,
 430                                nregs, pmatch, eflags);
 431
 432   rval = 0;
 433
 434   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 435   if (result != REG_NOERROR)
 436     rval = -1;
 437   else if (regs != NULL)
 438     {
 439       /* If caller wants register contents data back, copy them.  */
 440       bufp->re_regs_allocated = re_copy_regs (regs, pmatch, nregs,
 441                                               bufp->re_regs_allocated);
 442       if (BE (bufp->re_regs_allocated == REG_UNALLOCATED, 0))
 443         rval = -2;
 444     }
 445
 446   if (BE (rval == 0, 1))
 447     {
 448       if (ret_len)
 449         {
 450           assert (pmatch[0].rm_so == start);
 451           rval = pmatch[0].rm_eo - start;
 452         }
 453       else
 454         rval = pmatch[0].rm_so;
 455     }
 456   re_free (pmatch);
 457  out:
 458   __libc_lock_unlock (dfa->lock);
 459   return rval;
 460 }
 461
 462 static unsigned
 463 internal_function
 464 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs,
 465               int regs_allocated)
 466 {
 467   int rval = REG_REALLOCATE;
 468   int i;
 469   int need_regs = nregs + 1;
 470   /* We need one extra element beyond `rm_num_regs' for the `-1' marker GNU code
 471      uses.  */
 472
 473   /* Have the register data arrays been allocated?  */
 474   if (regs_allocated == REG_UNALLOCATED)
 475     { /* No.  So allocate them with malloc.  */
 476       regs->rm_start = re_malloc (regoff_t, need_regs);
 477       regs->rm_end = re_malloc (regoff_t, need_regs);
 478       if (BE (regs->rm_start == NULL, 0) || BE (regs->rm_end == NULL, 0))
 479         return REG_UNALLOCATED;
 480       regs->rm_num_regs = need_regs;
 481     }
 482   else if (regs_allocated == REG_REALLOCATE)
 483     { /* Yes.  If we need more elements than were already
 484          allocated, reallocate them.  If we need fewer, just
 485          leave it alone.  */
 486       if (BE (need_regs > regs->rm_num_regs, 0))
 487         {
 488           regoff_t *new_start =
 489             re_realloc (regs->rm_start, regoff_t, need_regs);
 490           regoff_t *new_end = re_realloc (regs->rm_end, regoff_t, need_regs);
 491           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 492             return REG_UNALLOCATED;
 493           regs->rm_start = new_start;
 494           regs->rm_end = new_end;
 495           regs->rm_num_regs = need_regs;
 496         }
 497     }
 498   else
 499     {
 500       assert (regs_allocated == REG_FIXED);
 501       /* This function may not be called with REG_FIXED and nregs too big.  */
 502       assert (regs->rm_num_regs >= nregs);
 503       rval = REG_FIXED;
 504     }
 505
 506   /* Copy the regs.  */
 507   for (i = 0; i < nregs; ++i)
 508     {
 509       regs->rm_start[i] = pmatch[i].rm_so;
 510       regs->rm_end[i] = pmatch[i].rm_eo;
 511     }
 512   for ( ; i < regs->rm_num_regs; ++i)
 513     regs->rm_start[i] = regs->rm_end[i] = -1;
 514
 515   return rval;
 516 }
 517
 518 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 519    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 520    this memory for recording register information.  STARTS and ENDS
 521    must be allocated using the malloc library routine, and must each
 522    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 523
 524    If NUM_REGS == 0, then subsequent matches should allocate their own
 525    register data.
 526
 527    Unless this function is called, the first search or match using
 528    PATTERN_BUFFER will allocate its own register data, without
 529    freeing the old data.  */
 530
 531 void
 532 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
 533                   unsigned int num_regs, regoff_t *starts, regoff_t *ends)
 534 {
 535   if (num_regs)
 536     {
 537       bufp->re_regs_allocated = REG_REALLOCATE;
 538       regs->rm_num_regs = num_regs;
 539       regs->rm_start = starts;
 540       regs->rm_end = ends;
 541     }
 542   else
 543     {
 544       bufp->re_regs_allocated = REG_UNALLOCATED;
 545       regs->rm_num_regs = 0;
 546       regs->rm_start = regs->rm_end = NULL;
 547     }
 548 }
 549 #ifdef _LIBC
 550 weak_alias (__re_set_registers, re_set_registers)
 551 #endif
 552 \f
 553 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 554    them unless specifically requested.  */
 555
 556 #if defined _REGEX_RE_COMP || defined _LIBC
 557 int
 558 # ifdef _LIBC
 559 weak_function
 560 # endif
 561 re_exec (s)
 562      const char *s;
 563 {
 564   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 565 }
 566 #endif /* _REGEX_RE_COMP */
 567 \f
 568 /* Internal entry point.  */
 569
 570 /* Searches for a compiled pattern PREG in the string STRING, whose
 571    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 572    mingings with regexec.  START, and RANGE have the same meanings
 573    with re_search.
 574    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 575    otherwise return the error code.
 576    Note: We assume front end functions already check ranges.
 577    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 578
 579 static reg_errcode_t
 580 internal_function
 581 re_search_internal (const regex_t *preg,
 582                     const char *string, int length,
 583                     int start, int range, int stop,
 584                     size_t nmatch, regmatch_t pmatch[],
 585                     int eflags)
 586 {
 587   reg_errcode_t err;
 588   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 589   int left_lim, right_lim, incr;
 590   int fl_longest_match, match_first, match_kind, match_last = -1;
 591   int extra_nmatch;
 592   int sb, ch;
 593 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 594   re_match_context_t mctx = { .dfa = dfa };
 595 #else
 596   re_match_context_t mctx;
 597 #endif
 598   char *fastmap = (preg->re_fastmap != NULL && preg->re_fastmap_accurate
 599                    && range && !preg->re_can_be_null) ? preg->re_fastmap : NULL;
 600   unsigned REG_TRANSLATE_TYPE t =
 601     (unsigned REG_TRANSLATE_TYPE) preg->re_translate;
 602
 603 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 604   memset (&mctx, '\0', sizeof (re_match_context_t));
 605   mctx.dfa = dfa;
 606 #endif
 607
 608   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 609   nmatch -= extra_nmatch;
 610
 611   /* Check if the DFA haven't been compiled.  */
 612   if (BE (preg->re_used == 0 || dfa->init_state == NULL
 613           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 614           || dfa->init_state_begbuf == NULL, 0))
 615     return REG_NOMATCH;
 616
 617 #ifdef DEBUG
 618   /* We assume front-end functions already check them.  */
 619   assert (start + range >= 0 && start + range <= length);
 620 #endif
 621
 622   /* If initial states with non-begbuf contexts have no elements,
 623      the regex must be anchored.  If preg->re_newline_anchor is set,
 624      we'll never use init_state_nl, so do not check it.  */
 625   if (dfa->init_state->nodes.nelem == 0
 626       && dfa->init_state_word->nodes.nelem == 0
 627       && (dfa->init_state_nl->nodes.nelem == 0
 628           || !preg->re_newline_anchor))
 629     {
 630       if (start != 0 && start + range != 0)
 631         return REG_NOMATCH;
 632       start = range = 0;
 633     }
 634
 635   /* We must check the longest matching, if nmatch > 0.  */
 636   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 637
 638   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 639                             preg->re_translate,
 640                             preg->re_syntax & REG_IGNORE_CASE, dfa);
 641   if (BE (err != REG_NOERROR, 0))
 642     goto free_return;
 643   mctx.input.stop = stop;
 644   mctx.input.raw_stop = stop;
 645   mctx.input.newline_anchor = preg->re_newline_anchor;
 646
 647   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 648   if (BE (err != REG_NOERROR, 0))
 649     goto free_return;
 650
 651   /* We will log all the DFA states through which the dfa pass,
 652      if nmatch > 1, or this dfa has "multibyte node", which is a
 653      back-reference or a node which can accept multibyte character or
 654      multi character collating element.  */
 655   if (nmatch > 1 || dfa->has_mb_node)
 656     {
 657       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 658       if (BE (mctx.state_log == NULL, 0))
 659         {
 660           err = REG_ESPACE;
 661           goto free_return;
 662         }
 663     }
 664   else
 665     mctx.state_log = NULL;
 666
 667   match_first = start;
 668   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 669                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 670
 671   /* Check incrementally whether of not the input string match.  */
 672   incr = (range < 0) ? -1 : 1;
 673   left_lim = (range < 0) ? start + range : start;
 674   right_lim = (range < 0) ? start : start + range;
 675   sb = dfa->mb_cur_max == 1;
 676   match_kind =
 677     (fastmap
 678      ? ((sb || !(preg->re_syntax & REG_IGNORE_CASE || t) ? 4 : 0)
 679         | (range >= 0 ? 2 : 0)
 680         | (t != NULL ? 1 : 0))
 681      : 8);
 682
 683   for (;; match_first += incr)
 684     {
 685       err = REG_NOMATCH;
 686       if (match_first < left_lim || right_lim < match_first)
 687         goto free_return;
 688
 689       /* Advance as rapidly as possible through the string, until we
 690          find a plausible place to start matching.  This may be done
 691          with varying efficiency, so there are various possibilities:
 692          only the most common of them are specialized, in order to
 693          save on code size.  We use a switch statement for speed.  */
 694       switch (match_kind)
 695         {
 696         case 8:
 697           /* No fastmap.  */
 698           break;
 699
 700         case 7:
 701           /* Fastmap with single-byte translation, match forward.  */
 702           while (BE (match_first < right_lim, 1)
 703                  && !fastmap[t[(unsigned char) string[match_first]]])
 704             ++match_first;
 705           goto forward_match_found_start_or_reached_end;
 706
 707         case 6:
 708           /* Fastmap without translation, match forward.  */
 709           while (BE (match_first < right_lim, 1)
 710                  && !fastmap[(unsigned char) string[match_first]])
 711             ++match_first;
 712
 713         forward_match_found_start_or_reached_end:
 714           if (BE (match_first == right_lim, 0))
 715             {
 716               ch = match_first >= length
 717                        ? 0 : (unsigned char) string[match_first];
 718               if (!fastmap[t ? t[ch] : ch])
 719                 goto free_return;
 720             }
 721           break;
 722
 723         case 4:
 724         case 5:
 725           /* Fastmap without multi-byte translation, match backwards.  */
 726           while (match_first >= left_lim)
 727             {
 728               ch = match_first >= length
 729                        ? 0 : (unsigned char) string[match_first];
 730               if (fastmap[t ? t[ch] : ch])
 731                 break;
 732               --match_first;
 733             }
 734           if (match_first < left_lim)
 735             goto free_return;
 736           break;
 737
 738         default:
 739           /* In this case, we can't determine easily the current byte,
 740              since it might be a component byte of a multibyte
 741              character.  Then we use the constructed buffer instead.  */
 742           for (;;)
 743             {
 744               /* If MATCH_FIRST is out of the valid range, reconstruct the
 745                  buffers.  */
 746               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 747               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 748                 {
 749                   err = re_string_reconstruct (&mctx.input, match_first,
 750                                                eflags);
 751                   if (BE (err != REG_NOERROR, 0))
 752                     goto free_return;
 753
 754                   offset = match_first - mctx.input.raw_mbs_idx;
 755                 }
 756               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 757                  Note that MATCH_FIRST must not be smaller than 0.  */
 758               ch = (match_first >= length
 759                     ? 0 : re_string_byte_at (&mctx.input, offset));
 760               if (fastmap[ch])
 761                 break;
 762               match_first += incr;
 763               if (match_first < left_lim || match_first > right_lim)
 764                 {
 765                   err = REG_NOMATCH;
 766                   goto free_return;
 767                 }
 768             }
 769           break;
 770         }
 771
 772       /* Reconstruct the buffers so that the matcher can assume that
 773          the matching starts from the beginning of the buffer.  */
 774       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 775       if (BE (err != REG_NOERROR, 0))
 776         goto free_return;
 777
 778 #ifdef RE_ENABLE_I18N
 779      /* Don't consider this char as a possible match start if it part,
 780         yet isn't the head, of a multibyte character.  */
 781       if (!sb && !re_string_first_byte (&mctx.input, 0))
 782         continue;
 783 #endif
 784
 785       /* It seems to be appropriate one, then use the matcher.  */
 786       /* We assume that the matching starts from 0.  */
 787       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 788       match_last = check_matching (&mctx, fl_longest_match,
 789                                    range >= 0 ? &match_first : NULL);
 790       if (match_last != -1)
 791         {
 792           if (BE (match_last == -2, 0))
 793             {
 794               err = REG_ESPACE;
 795               goto free_return;
 796             }
 797           else
 798             {
 799               mctx.match_last = match_last;
 800               if ((!preg->re_no_sub && nmatch > 1) || dfa->nbackref)
 801                 {
 802                   re_dfastate_t *pstate = mctx.state_log[match_last];
 803                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 804                                                              match_last);
 805                 }
 806               if ((!preg->re_no_sub && nmatch > 1 && dfa->has_plural_match)
 807                   || dfa->nbackref)
 808                 {
 809                   err = prune_impossible_nodes (&mctx);
 810                   if (err == REG_NOERROR)
 811                     break;
 812                   if (BE (err != REG_NOMATCH, 0))
 813                     goto free_return;
 814                   match_last = -1;
 815                 }
 816               else
 817                 break; /* We found a match.  */
 818             }
 819         }
 820
 821       match_ctx_clean (&mctx);
 822     }
 823
 824 #ifdef DEBUG
 825   assert (match_last != -1);
 826   assert (err == REG_NOERROR);
 827 #endif
 828
 829   /* Set pmatch[] if we need.  */
 830   if (nmatch > 0)
 831     {
 832       int reg_idx;
 833
 834       /* Initialize registers.  */
 835       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 836         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 837
 838       /* Set the points where matching start/end.  */
 839       pmatch[0].rm_so = 0;
 840       pmatch[0].rm_eo = mctx.match_last;
 841
 842       if (!preg->re_no_sub && nmatch > 1)
 843         {
 844           err = set_regs (preg, &mctx, nmatch, pmatch,
 845                           dfa->has_plural_match && dfa->nbackref > 0);
 846           if (BE (err != REG_NOERROR, 0))
 847             goto free_return;
 848         }
 849
 850       /* At last, add the offset to the each registers, since we slided
 851          the buffers so that we could assume that the matching starts
 852          from 0.  */
 853       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 854         if (pmatch[reg_idx].rm_so != -1)
 855           {
 856 #ifdef RE_ENABLE_I18N
 857             if (BE (mctx.input.offsets_needed != 0, 0))
 858               {
 859                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 860                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 861                 else
 862                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 863                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 864                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 865                 else
 866                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 867               }
 868 #else
 869             assert (mctx.input.offsets_needed == 0);
 870 #endif
 871             pmatch[reg_idx].rm_so += match_first;
 872             pmatch[reg_idx].rm_eo += match_first;
 873           }
 874       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 875         {
 876           pmatch[nmatch + reg_idx].rm_so = -1;
 877           pmatch[nmatch + reg_idx].rm_eo = -1;
 878         }
 879
 880       if (dfa->subexp_map)
 881         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 882           if (dfa->subexp_map[reg_idx] != reg_idx)
 883             {
 884               pmatch[reg_idx + 1].rm_so
 885                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 886               pmatch[reg_idx + 1].rm_eo
 887                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 888             }
 889     }
 890
 891  free_return:
 892   re_free (mctx.state_log);
 893   if (dfa->nbackref)
 894     match_ctx_free (&mctx);
 895   re_string_destruct (&mctx.input);
 896   return err;
 897 }
 898
 899 static reg_errcode_t
 900 internal_function
 901 prune_impossible_nodes (re_match_context_t *mctx)
 902 {
 903   re_dfa_t *const dfa = mctx->dfa;
 904   int halt_node, match_last;
 905   reg_errcode_t ret;
 906   re_dfastate_t **sifted_states;
 907   re_dfastate_t **lim_states = NULL;
 908   re_sift_context_t sctx;
 909 #ifdef DEBUG
 910   assert (mctx->state_log != NULL);
 911 #endif
 912   match_last = mctx->match_last;
 913   halt_node = mctx->last_node;
 914   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 915   if (BE (sifted_states == NULL, 0))
 916     {
 917       ret = REG_ESPACE;
 918       goto free_return;
 919     }
 920   if (dfa->nbackref)
 921     {
 922       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 923       if (BE (lim_states == NULL, 0))
 924         {
 925           ret = REG_ESPACE;
 926           goto free_return;
 927         }
 928       while (1)
 929         {
 930           memset (lim_states, '\0',
 931                   sizeof (re_dfastate_t *) * (match_last + 1));
 932           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 933                          match_last);
 934           ret = sift_states_backward (mctx, &sctx);
 935           re_node_set_free (&sctx.limits);
 936           if (BE (ret != REG_NOERROR, 0))
 937               goto free_return;
 938           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 939             break;
 940           do
 941             {
 942               --match_last;
 943               if (match_last < 0)
 944                 {
 945                   ret = REG_NOMATCH;
 946                   goto free_return;
 947                 }
 948             } while (mctx->state_log[match_last] == NULL
 949                      || !mctx->state_log[match_last]->halt);
 950           halt_node = check_halt_state_context (mctx,
 951                                                 mctx->state_log[match_last],
 952                                                 match_last);
 953         }
 954       ret = merge_state_array (dfa, sifted_states, lim_states,
 955                                match_last + 1);
 956       re_free (lim_states);
 957       lim_states = NULL;
 958       if (BE (ret != REG_NOERROR, 0))
 959         goto free_return;
 960     }
 961   else
 962     {
 963       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 964       ret = sift_states_backward (mctx, &sctx);
 965       re_node_set_free (&sctx.limits);
 966       if (BE (ret != REG_NOERROR, 0))
 967         goto free_return;
 968     }
 969   re_free (mctx->state_log);
 970   mctx->state_log = sifted_states;
 971   sifted_states = NULL;
 972   mctx->last_node = halt_node;
 973   mctx->match_last = match_last;
 974   ret = REG_NOERROR;
 975  free_return:
 976   re_free (sifted_states);
 977   re_free (lim_states);
 978   return ret;
 979 }
 980
 981 /* Acquire an initial state and return it.
 982    We must select appropriate initial state depending on the context,
 983    since initial states may have constraints like "\<", "^", etc..  */
 984
 985 static inline re_dfastate_t *
 986 __attribute ((always_inline)) internal_function
 987 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
 988                             int idx)
 989 {
 990   re_dfa_t *const dfa = mctx->dfa;
 991   if (dfa->init_state->has_constraint)
 992     {
 993       unsigned int context;
 994       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
 995       if (IS_WORD_CONTEXT (context))
 996         return dfa->init_state_word;
 997       else if (IS_ORDINARY_CONTEXT (context))
 998         return dfa->init_state;
 999       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1000         return dfa->init_state_begbuf;
1001       else if (IS_NEWLINE_CONTEXT (context))
1002         return dfa->init_state_nl;
1003       else if (IS_BEGBUF_CONTEXT (context))
1004         {
1005           /* It is relatively rare case, then calculate on demand.  */
1006           return re_acquire_state_context (err, dfa,
1007                                            dfa->init_state->entrance_nodes,
1008                                            context);
1009         }
1010       else
1011         /* Must not happen?  */
1012         return dfa->init_state;
1013     }
1014   else
1015     return dfa->init_state;
1016 }
1017
1018 /* Check whether the regular expression match input string INPUT or not,
1019    and return the index where the matching end, return -1 if not match,
1020    or return -2 in case of an error.
1021    FL_LONGEST_MATCH means we want the POSIX longest matching.
1022    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1023    next place where we may want to try matching.
1024    Note that the matcher assume that the maching starts from the current
1025    index of the buffer.  */
1026
1027 static int
1028 internal_function
1029 check_matching (re_match_context_t *mctx, int fl_longest_match,
1030                 int *p_match_first)
1031 {
1032   re_dfa_t *const dfa = mctx->dfa;
1033   reg_errcode_t err;
1034   int match = 0;
1035   int match_last = -1;
1036   int cur_str_idx = re_string_cur_idx (&mctx->input);
1037   re_dfastate_t *cur_state;
1038   int at_init_state = p_match_first != NULL;
1039   int next_start_idx = cur_str_idx;
1040
1041   err = REG_NOERROR;
1042   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1043   /* An initial state must not be NULL (invalid).  */
1044   if (BE (cur_state == NULL, 0))
1045     {
1046       assert (err == REG_ESPACE);
1047       return -2;
1048     }
1049
1050   if (mctx->state_log != NULL)
1051     {
1052       mctx->state_log[cur_str_idx] = cur_state;
1053
1054       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1055          later.  E.g. Processing back references.  */
1056       if (BE (dfa->nbackref, 0))
1057         {
1058           at_init_state = 0;
1059           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1060           if (BE (err != REG_NOERROR, 0))
1061             return err;
1062
1063           if (cur_state->has_backref)
1064             {
1065               err = transit_state_bkref (mctx, &cur_state->nodes);
1066               if (BE (err != REG_NOERROR, 0))
1067                 return err;
1068             }
1069         }
1070     }
1071
1072   /* If the RE accepts NULL string.  */
1073   if (BE (cur_state->halt, 0))
1074     {
1075       if (!cur_state->has_constraint
1076           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1077         {
1078           if (!fl_longest_match)
1079             return cur_str_idx;
1080           else
1081             {
1082               match_last = cur_str_idx;
1083               match = 1;
1084             }
1085         }
1086     }
1087
1088   while (!re_string_eoi (&mctx->input))
1089     {
1090       re_dfastate_t *old_state = cur_state;
1091       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1092
1093       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1094           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1095               && mctx->input.valid_len < mctx->input.len))
1096         {
1097           err = extend_buffers (mctx);
1098           if (BE (err != REG_NOERROR, 0))
1099             {
1100               assert (err == REG_ESPACE);
1101               return -2;
1102             }
1103         }
1104
1105       cur_state = transit_state (&err, mctx, cur_state);
1106       if (mctx->state_log != NULL)
1107         cur_state = merge_state_with_log (&err, mctx, cur_state);
1108
1109       if (cur_state == NULL)
1110         {
1111           /* Reached the invalid state or an error.  Try to recover a valid
1112              state using the state log, if available and if we have not
1113              already found a valid (even if not the longest) match.  */
1114           if (BE (err != REG_NOERROR, 0))
1115             return -2;
1116
1117           if (mctx->state_log == NULL
1118               || (match && !fl_longest_match)
1119               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1120             break;
1121         }
1122
1123       if (BE (at_init_state, 0))
1124         {
1125           if (old_state == cur_state)
1126             next_start_idx = next_char_idx;
1127           else
1128             at_init_state = 0;
1129         }
1130
1131       if (cur_state->halt)
1132         {
1133           /* Reached a halt state.
1134              Check the halt state can satisfy the current context.  */
1135           if (!cur_state->has_constraint
1136               || check_halt_state_context (mctx, cur_state,
1137                                            re_string_cur_idx (&mctx->input)))
1138             {
1139               /* We found an appropriate halt state.  */
1140               match_last = re_string_cur_idx (&mctx->input);
1141               match = 1;
1142
1143               /* We found a match, do not modify match_first below.  */
1144               p_match_first = NULL;
1145               if (!fl_longest_match)
1146                 break;
1147             }
1148         }
1149     }
1150
1151   if (p_match_first)
1152     *p_match_first += next_start_idx;
1153
1154   return match_last;
1155 }
1156
1157 /* Check NODE match the current context.  */
1158
1159 static int
1160 internal_function
1161 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1162 {
1163   re_token_type_t type = dfa->nodes[node].type;
1164   unsigned int constraint = dfa->nodes[node].constraint;
1165   if (type != END_OF_RE)
1166     return 0;
1167   if (!constraint)
1168     return 1;
1169   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1170     return 0;
1171   return 1;
1172 }
1173
1174 /* Check the halt state STATE match the current context.
1175    Return 0 if not match, if the node, STATE has, is a halt node and
1176    match the context, return the node.  */
1177
1178 static int
1179 internal_function
1180 check_halt_state_context (const re_match_context_t *mctx,
1181                           const re_dfastate_t *state, int idx)
1182 {
1183   int i;
1184   unsigned int context;
1185 #ifdef DEBUG
1186   assert (state->halt);
1187 #endif
1188   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1189   for (i = 0; i < state->nodes.nelem; ++i)
1190     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1191       return state->nodes.elems[i];
1192   return 0;
1193 }
1194
1195 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1196    corresponding to the DFA).
1197    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1198    of errors.  */
1199
1200 static int
1201 internal_function
1202 proceed_next_node (const re_match_context_t *mctx,
1203                    int nregs, regmatch_t *regs, int *pidx, int node,
1204                    re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1205 {
1206   re_dfa_t *const dfa = mctx->dfa;
1207   int i, err;
1208   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1209     {
1210       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1211       re_node_set *edests = &dfa->edests[node];
1212       int dest_node;
1213       err = re_node_set_insert (eps_via_nodes, node);
1214       if (BE (err < 0, 0))
1215         return -2;
1216       /* Pick up a valid destination, or return -1 if none is found.  */
1217       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1218         {
1219           int candidate = edests->elems[i];
1220           if (!re_node_set_contains (cur_nodes, candidate))
1221             continue;
1222           if (dest_node == -1)
1223             dest_node = candidate;
1224
1225           else
1226             {
1227               /* In order to avoid infinite loop like "(a*)*", return the second
1228                  epsilon-transition if the first was already considered.  */
1229               if (re_node_set_contains (eps_via_nodes, dest_node))
1230                 return candidate;
1231
1232               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1233               else if (fs != NULL
1234                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1235                                            eps_via_nodes))
1236                 return -2;
1237
1238               /* We know we are going to exit.  */
1239               break;
1240             }
1241         }
1242       return dest_node;
1243     }
1244   else
1245     {
1246       int naccepted = 0;
1247       re_token_type_t type = dfa->nodes[node].type;
1248
1249 #ifdef RE_ENABLE_I18N
1250       if (dfa->nodes[node].accept_mb)
1251         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1252       else
1253 #endif /* RE_ENABLE_I18N */
1254       if (type == OP_BACK_REF)
1255         {
1256           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1257           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1258           if (fs != NULL)
1259             {
1260               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1261                 return -1;
1262               else if (naccepted)
1263                 {
1264                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1265                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1266                               naccepted) != 0)
1267                     return -1;
1268                 }
1269             }
1270
1271           if (naccepted == 0)
1272             {
1273               int dest_node;
1274               err = re_node_set_insert (eps_via_nodes, node);
1275               if (BE (err < 0, 0))
1276                 return -2;
1277               dest_node = dfa->edests[node].elems[0];
1278               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1279                                         dest_node))
1280                 return dest_node;
1281             }
1282         }
1283
1284       if (naccepted != 0
1285           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1286         {
1287           int dest_node = dfa->nexts[node];
1288           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1289           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1290                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1291                                                dest_node)))
1292             return -1;
1293           re_node_set_empty (eps_via_nodes);
1294           return dest_node;
1295         }
1296     }
1297   return -1;
1298 }
1299
1300 static reg_errcode_t
1301 internal_function
1302 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1303                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1304 {
1305   reg_errcode_t err;
1306   int num = fs->num++;
1307   if (fs->num == fs->alloc)
1308     {
1309       struct re_fail_stack_ent_t *new_array =
1310         re_realloc (fs->stack, struct re_fail_stack_ent_t, fs->alloc * 2);
1311       if (new_array == NULL)
1312         return REG_ESPACE;
1313       fs->alloc *= 2;
1314       fs->stack = new_array;
1315     }
1316   fs->stack[num].idx = str_idx;
1317   fs->stack[num].node = dest_node;
1318   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1319   if (fs->stack[num].regs == NULL)
1320     return REG_ESPACE;
1321   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1322   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1323   return err;
1324 }
1325
1326 static int
1327 internal_function
1328 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx,
1329                 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1330 {
1331   int num = --fs->num;
1332   assert (num >= 0);
1333   *pidx = fs->stack[num].idx;
1334   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1335   re_node_set_free (eps_via_nodes);
1336   re_free (fs->stack[num].regs);
1337   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1338   return fs->stack[num].node;
1339 }
1340
1341 /* Set the positions where the subexpressions are starts/ends to registers
1342    PMATCH.
1343    Note: We assume that pmatch[0] is already set, and
1344    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1345
1346 static reg_errcode_t
1347 internal_function
1348 set_regs (const regex_t *preg, const re_match_context_t *mctx,
1349           size_t nmatch, regmatch_t *pmatch, int fl_backtrack)
1350 {
1351   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1352   int idx, cur_node;
1353   re_node_set eps_via_nodes;
1354   struct re_fail_stack_t *fs;
1355   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1356   regmatch_t *prev_idx_match;
1357   int prev_idx_match_malloced = 0;
1358
1359 #ifdef DEBUG
1360   assert (nmatch > 1);
1361   assert (mctx->state_log != NULL);
1362 #endif
1363   if (fl_backtrack)
1364     {
1365       fs = &fs_body;
1366       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1367       if (fs->stack == NULL)
1368         return REG_ESPACE;
1369     }
1370   else
1371     fs = NULL;
1372
1373   cur_node = dfa->init_node;
1374   re_node_set_init_empty (&eps_via_nodes);
1375
1376   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1377     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1378   else
1379     {
1380       prev_idx_match = re_malloc (regmatch_t, nmatch);
1381       if (prev_idx_match == NULL)
1382         {
1383           free_fail_stack_return (fs);
1384           return REG_ESPACE;
1385         }
1386       prev_idx_match_malloced = 1;
1387     }
1388   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1389
1390   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1391     {
1392       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1393
1394       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1395         {
1396           int reg_idx;
1397           if (fs)
1398             {
1399               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1400                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1401                   break;
1402               if (reg_idx == nmatch)
1403                 {
1404                   re_node_set_free (&eps_via_nodes);
1405                   if (prev_idx_match_malloced)
1406                     re_free (prev_idx_match);
1407                   return free_fail_stack_return (fs);
1408                 }
1409               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1410                                          &eps_via_nodes);
1411             }
1412           else
1413             {
1414               re_node_set_free (&eps_via_nodes);
1415               if (prev_idx_match_malloced)
1416                 re_free (prev_idx_match);
1417               return REG_NOERROR;
1418             }
1419         }
1420
1421       /* Proceed to next node.  */
1422       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1423                                     &eps_via_nodes, fs);
1424
1425       if (BE (cur_node < 0, 0))
1426         {
1427           if (BE (cur_node == -2, 0))
1428             {
1429               re_node_set_free (&eps_via_nodes);
1430               if (prev_idx_match_malloced)
1431                 re_free (prev_idx_match);
1432               free_fail_stack_return (fs);
1433               return REG_ESPACE;
1434             }
1435           if (fs)
1436             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1437                                        &eps_via_nodes);
1438           else
1439             {
1440               re_node_set_free (&eps_via_nodes);
1441               if (prev_idx_match_malloced)
1442                 re_free (prev_idx_match);
1443               return REG_NOMATCH;
1444             }
1445         }
1446     }
1447   re_node_set_free (&eps_via_nodes);
1448   if (prev_idx_match_malloced)
1449     re_free (prev_idx_match);
1450   return free_fail_stack_return (fs);
1451 }
1452
1453 static reg_errcode_t
1454 internal_function
1455 free_fail_stack_return (struct re_fail_stack_t *fs)
1456 {
1457   if (fs)
1458     {
1459       int fs_idx;
1460       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1461         {
1462           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1463           re_free (fs->stack[fs_idx].regs);
1464         }
1465       re_free (fs->stack);
1466     }
1467   return REG_NOERROR;
1468 }
1469
1470 static void
1471 internal_function
1472 update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1473              int cur_node, int cur_idx, int nmatch)
1474 {
1475   int type = dfa->nodes[cur_node].type;
1476   if (type == OP_OPEN_SUBEXP)
1477     {
1478       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1479
1480       /* We are at the first node of this sub expression.  */
1481       if (reg_num < nmatch)
1482         {
1483           pmatch[reg_num].rm_so = cur_idx;
1484           pmatch[reg_num].rm_eo = -1;
1485         }
1486     }
1487   else if (type == OP_CLOSE_SUBEXP)
1488     {
1489       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1490       if (reg_num < nmatch)
1491         {
1492           /* We are at the last node of this sub expression.  */
1493           if (pmatch[reg_num].rm_so < cur_idx)
1494             {
1495               pmatch[reg_num].rm_eo = cur_idx;
1496               /* This is a non-empty match or we are not inside an optional
1497                  subexpression.  Accept this right away.  */
1498               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1499             }
1500           else
1501             {
1502               if (dfa->nodes[cur_node].opt_subexp
1503                   && prev_idx_match[reg_num].rm_so != -1)
1504                 /* We transited through an empty match for an optional
1505                    subexpression, like (a?)*, and this is not the subexp's
1506                    first match.  Copy back the old content of the registers
1507                    so that matches of an inner subexpression are undone as
1508                    well, like in ((a?))*.  */
1509                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1510               else
1511                 /* We completed a subexpression, but it may be part of
1512                    an optional one, so do not update PREV_IDX_MATCH.  */
1513                 pmatch[reg_num].rm_eo = cur_idx;
1514             }
1515         }
1516     }
1517 }
1518
1519 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1520    and sift the nodes in each states according to the following rules.
1521    Updated state_log will be wrote to STATE_LOG.
1522
1523    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1524      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1525         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1526         the LAST_NODE, we throw away the node `a'.
1527      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1528         string `s' and transit to `b':
1529         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1530            away the node `a'.
1531         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1532             thrown away, we throw away the node `a'.
1533      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1534         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1535            node `a'.
1536         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1537             we throw away the node `a'.  */
1538
1539 #define STATE_NODE_CONTAINS(state,node) \
1540   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1541
1542 static reg_errcode_t
1543 internal_function
1544 sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1545 {
1546   reg_errcode_t err;
1547   int null_cnt = 0;
1548   int str_idx = sctx->last_str_idx;
1549   re_node_set cur_dest;
1550
1551 #ifdef DEBUG
1552   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1553 #endif
1554
1555   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1556      transit to the last_node and the last_node itself.  */
1557   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1558   if (BE (err != REG_NOERROR, 0))
1559     return err;
1560   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1561   if (BE (err != REG_NOERROR, 0))
1562     goto free_return;
1563
1564   /* Then check each states in the state_log.  */
1565   while (str_idx > 0)
1566     {
1567       /* Update counters.  */
1568       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1569       if (null_cnt > mctx->max_mb_elem_len)
1570         {
1571           memset (sctx->sifted_states, '\0',
1572                   sizeof (re_dfastate_t *) * str_idx);
1573           re_node_set_free (&cur_dest);
1574           return REG_NOERROR;
1575         }
1576       re_node_set_empty (&cur_dest);
1577       --str_idx;
1578
1579       if (mctx->state_log[str_idx])
1580         {
1581           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1582           if (BE (err != REG_NOERROR, 0))
1583             goto free_return;
1584         }
1585
1586       /* Add all the nodes which satisfy the following conditions:
1587          - It can epsilon transit to a node in CUR_DEST.
1588          - It is in CUR_SRC.
1589          And update state_log.  */
1590       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1591       if (BE (err != REG_NOERROR, 0))
1592         goto free_return;
1593     }
1594   err = REG_NOERROR;
1595  free_return:
1596   re_node_set_free (&cur_dest);
1597   return err;
1598 }
1599
1600 static reg_errcode_t
1601 internal_function
1602 build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1603                      int str_idx, re_node_set *cur_dest)
1604 {
1605   re_dfa_t *const dfa = mctx->dfa;
1606   re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1607   int i;
1608
1609   /* Then build the next sifted state.
1610      We build the next sifted state on `cur_dest', and update
1611      `sifted_states[str_idx]' with `cur_dest'.
1612      Note:
1613      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1614      `cur_src' points the node_set of the old `state_log[str_idx]'
1615      (with the epsilon nodes pre-filtered out).  */
1616   for (i = 0; i < cur_src->nelem; i++)
1617     {
1618       int prev_node = cur_src->elems[i];
1619       int naccepted = 0;
1620       int ret;
1621
1622 #ifdef DEBUG
1623       re_token_type_t type = dfa->nodes[prev_node].type;
1624       assert (!IS_EPSILON_NODE (type));
1625 #endif
1626 #ifdef RE_ENABLE_I18N
1627       /* If the node may accept `multi byte'.  */
1628       if (dfa->nodes[prev_node].accept_mb)
1629         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1630                                          str_idx, sctx->last_str_idx);
1631 #endif /* RE_ENABLE_I18N */
1632
1633       /* We don't check backreferences here.
1634          See update_cur_sifted_state().  */
1635       if (!naccepted
1636           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1637           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1638                                   dfa->nexts[prev_node]))
1639         naccepted = 1;
1640
1641       if (naccepted == 0)
1642         continue;
1643
1644       if (sctx->limits.nelem)
1645         {
1646           int to_idx = str_idx + naccepted;
1647           if (check_dst_limits (mctx, &sctx->limits,
1648                                 dfa->nexts[prev_node], to_idx,
1649                                 prev_node, str_idx))
1650             continue;
1651         }
1652       ret = re_node_set_insert (cur_dest, prev_node);
1653       if (BE (ret == -1, 0))
1654         return REG_ESPACE;
1655     }
1656
1657   return REG_NOERROR;
1658 }
1659
1660 /* Helper functions.  */
1661
1662 static reg_errcode_t
1663 internal_function
1664 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1665 {
1666   int top = mctx->state_log_top;
1667
1668   if (next_state_log_idx >= mctx->input.bufs_len
1669       || (next_state_log_idx >= mctx->input.valid_len
1670           && mctx->input.valid_len < mctx->input.len))
1671     {
1672       reg_errcode_t err;
1673       err = extend_buffers (mctx);
1674       if (BE (err != REG_NOERROR, 0))
1675         return err;
1676     }
1677
1678   if (top < next_state_log_idx)
1679     {
1680       memset (mctx->state_log + top + 1, '\0',
1681               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1682       mctx->state_log_top = next_state_log_idx;
1683     }
1684   return REG_NOERROR;
1685 }
1686
1687 static reg_errcode_t
1688 internal_function
1689 merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1690                    int num)
1691 {
1692   int st_idx;
1693   reg_errcode_t err;
1694   for (st_idx = 0; st_idx < num; ++st_idx)
1695     {
1696       if (dst[st_idx] == NULL)
1697         dst[st_idx] = src[st_idx];
1698       else if (src[st_idx] != NULL)
1699         {
1700           re_node_set merged_set;
1701           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1702                                         &src[st_idx]->nodes);
1703           if (BE (err != REG_NOERROR, 0))
1704             return err;
1705           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1706           re_node_set_free (&merged_set);
1707           if (BE (err != REG_NOERROR, 0))
1708             return err;
1709         }
1710     }
1711   return REG_NOERROR;
1712 }
1713
1714 static reg_errcode_t
1715 internal_function
1716 update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1717                          int str_idx, re_node_set *dest_nodes)
1718 {
1719   re_dfa_t *const dfa = mctx->dfa;
1720   reg_errcode_t err;
1721   const re_node_set *candidates;
1722   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1723                 : &mctx->state_log[str_idx]->nodes);
1724
1725   if (dest_nodes->nelem == 0)
1726     sctx->sifted_states[str_idx] = NULL;
1727   else
1728     {
1729       if (candidates)
1730         {
1731           /* At first, add the nodes which can epsilon transit to a node in
1732              DEST_NODE.  */
1733           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1734           if (BE (err != REG_NOERROR, 0))
1735             return err;
1736
1737           /* Then, check the limitations in the current sift_context.  */
1738           if (sctx->limits.nelem)
1739             {
1740               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1741                                          mctx->bkref_ents, str_idx);
1742               if (BE (err != REG_NOERROR, 0))
1743                 return err;
1744             }
1745         }
1746
1747       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1748       if (BE (err != REG_NOERROR, 0))
1749         return err;
1750     }
1751
1752   if (candidates && mctx->state_log[str_idx]->has_backref)
1753     {
1754       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1755       if (BE (err != REG_NOERROR, 0))
1756         return err;
1757     }
1758   return REG_NOERROR;
1759 }
1760
1761 static reg_errcode_t
1762 internal_function
1763 add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1764                        const re_node_set *candidates)
1765 {
1766   reg_errcode_t err = REG_NOERROR;
1767   int i;
1768
1769   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1770   if (BE (err != REG_NOERROR, 0))
1771     return err;
1772
1773   if (!state->inveclosure.alloc)
1774     {
1775       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1776       if (BE (err != REG_NOERROR, 0))
1777         return REG_ESPACE;
1778       for (i = 0; i < dest_nodes->nelem; i++)
1779         re_node_set_merge (&state->inveclosure,
1780                            dfa->inveclosures + dest_nodes->elems[i]);
1781     }
1782   return re_node_set_add_intersect (dest_nodes, candidates,
1783                                     &state->inveclosure);
1784 }
1785
1786 static reg_errcode_t
1787 internal_function
1788 sub_epsilon_src_nodes (re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1789                        const re_node_set *candidates)
1790 {
1791     int ecl_idx;
1792     reg_errcode_t err;
1793     re_node_set *inv_eclosure = dfa->inveclosures + node;
1794     re_node_set except_nodes;
1795     re_node_set_init_empty (&except_nodes);
1796     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1797       {
1798         int cur_node = inv_eclosure->elems[ecl_idx];
1799         if (cur_node == node)
1800           continue;
1801         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1802           {
1803             int edst1 = dfa->edests[cur_node].elems[0];
1804             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1805                          ? dfa->edests[cur_node].elems[1] : -1);
1806             if ((!re_node_set_contains (inv_eclosure, edst1)
1807                  && re_node_set_contains (dest_nodes, edst1))
1808                 || (edst2 > 0
1809                     && !re_node_set_contains (inv_eclosure, edst2)
1810                     && re_node_set_contains (dest_nodes, edst2)))
1811               {
1812                 err = re_node_set_add_intersect (&except_nodes, candidates,
1813                                                  dfa->inveclosures + cur_node);
1814                 if (BE (err != REG_NOERROR, 0))
1815                   {
1816                     re_node_set_free (&except_nodes);
1817                     return err;
1818                   }
1819               }
1820           }
1821       }
1822     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1823       {
1824         int cur_node = inv_eclosure->elems[ecl_idx];
1825         if (!re_node_set_contains (&except_nodes, cur_node))
1826           {
1827             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1828             re_node_set_remove_at (dest_nodes, idx);
1829           }
1830       }
1831     re_node_set_free (&except_nodes);
1832     return REG_NOERROR;
1833 }
1834
1835 static int
1836 internal_function
1837 check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
1838                   int dst_node, int dst_idx, int src_node, int src_idx)
1839 {
1840   re_dfa_t *const dfa = mctx->dfa;
1841   int lim_idx, src_pos, dst_pos;
1842
1843   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1844   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1845   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1846     {
1847       int subexp_idx;
1848       struct re_backref_cache_entry *ent;
1849       ent = mctx->bkref_ents + limits->elems[lim_idx];
1850       subexp_idx = dfa->nodes[ent->node].opr.idx;
1851
1852       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1853                                            subexp_idx, dst_node, dst_idx,
1854                                            dst_bkref_idx);
1855       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1856                                            subexp_idx, src_node, src_idx,
1857                                            src_bkref_idx);
1858
1859       /* In case of:
1860          <src> <dst> ( <subexp> )
1861          ( <subexp> ) <src> <dst>
1862          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1863       if (src_pos == dst_pos)
1864         continue; /* This is unrelated limitation.  */
1865       else
1866         return 1;
1867     }
1868   return 0;
1869 }
1870
1871 static int
1872 internal_function
1873 check_dst_limits_calc_pos_1 (re_match_context_t *mctx, int boundaries,
1874                              int subexp_idx, int from_node, int bkref_idx)
1875 {
1876   re_dfa_t *const dfa = mctx->dfa;
1877   re_node_set *eclosures = dfa->eclosures + from_node;
1878   int node_idx;
1879
1880   /* Else, we are on the boundary: examine the nodes on the epsilon
1881      closure.  */
1882   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1883     {
1884       int node = eclosures->elems[node_idx];
1885       switch (dfa->nodes[node].type)
1886         {
1887         case OP_BACK_REF:
1888           if (bkref_idx != -1)
1889             {
1890               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1891               do
1892                 {
1893                   int dst, cpos;
1894
1895                   if (ent->node != node)
1896                     continue;
1897
1898                   if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1899                       && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1900                     continue;
1901
1902                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1903                      OP_CLOSE_SUBEXP cases below.  But, if the
1904                      destination node is the same node as the source
1905                      node, don't recurse because it would cause an
1906                      infinite loop: a regex that exhibits this behavior
1907                      is ()\1*\1*  */
1908                   dst = dfa->edests[node].elems[0];
1909                   if (dst == from_node)
1910                     {
1911                       if (boundaries & 1)
1912                         return -1;
1913                       else /* if (boundaries & 2) */
1914                         return 0;
1915                     }
1916
1917                   cpos =
1918                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1919                                                  dst, bkref_idx);
1920                   if (cpos == -1 /* && (boundaries & 1) */)
1921                     return -1;
1922                   if (cpos == 0 && (boundaries & 2))
1923                     return 0;
1924
1925                   ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1926                 }
1927               while (ent++->more);
1928             }
1929           break;
1930
1931         case OP_OPEN_SUBEXP:
1932           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1933             return -1;
1934           break;
1935
1936         case OP_CLOSE_SUBEXP:
1937           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1938             return 0;
1939           break;
1940
1941         default:
1942             break;
1943         }
1944     }
1945
1946   return (boundaries & 2) ? 1 : 0;
1947 }
1948
1949 static int
1950 internal_function
1951 check_dst_limits_calc_pos (re_match_context_t *mctx, int limit, int subexp_idx,
1952                            int from_node, int str_idx, int bkref_idx)
1953 {
1954   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1955   int boundaries;
1956
1957   /* If we are outside the range of the subexpression, return -1 or 1.  */
1958   if (str_idx < lim->subexp_from)
1959     return -1;
1960
1961   if (lim->subexp_to < str_idx)
1962     return 1;
1963
1964   /* If we are within the subexpression, return 0.  */
1965   boundaries = (str_idx == lim->subexp_from);
1966   boundaries |= (str_idx == lim->subexp_to) << 1;
1967   if (boundaries == 0)
1968     return 0;
1969
1970   /* Else, examine epsilon closure.  */
1971   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1972                                       from_node, bkref_idx);
1973 }
1974
1975 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1976    which are against limitations from DEST_NODES. */
1977
1978 static reg_errcode_t
1979 internal_function
1980 check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
1981                      const re_node_set *candidates, re_node_set *limits,
1982                      struct re_backref_cache_entry *bkref_ents, int str_idx)
1983 {
1984   reg_errcode_t err;
1985   int node_idx, lim_idx;
1986
1987   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1988     {
1989       int subexp_idx;
1990       struct re_backref_cache_entry *ent;
1991       ent = bkref_ents + limits->elems[lim_idx];
1992
1993       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1994         continue; /* This is unrelated limitation.  */
1995
1996       subexp_idx = dfa->nodes[ent->node].opr.idx;
1997       if (ent->subexp_to == str_idx)
1998         {
1999           int ops_node = -1;
2000           int cls_node = -1;
2001           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2002             {
2003               int node = dest_nodes->elems[node_idx];
2004               re_token_type_t type = dfa->nodes[node].type;
2005               if (type == OP_OPEN_SUBEXP
2006                   && subexp_idx == dfa->nodes[node].opr.idx)
2007                 ops_node = node;
2008               else if (type == OP_CLOSE_SUBEXP
2009                        && subexp_idx == dfa->nodes[node].opr.idx)
2010                 cls_node = node;
2011             }
2012
2013           /* Check the limitation of the open subexpression.  */
2014           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2015           if (ops_node >= 0)
2016             {
2017               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2018                                            candidates);
2019               if (BE (err != REG_NOERROR, 0))
2020                 return err;
2021             }
2022
2023           /* Check the limitation of the close subexpression.  */
2024           if (cls_node >= 0)
2025             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2026               {
2027                 int node = dest_nodes->elems[node_idx];
2028                 if (!re_node_set_contains (dfa->inveclosures + node,
2029                                            cls_node)
2030                     && !re_node_set_contains (dfa->eclosures + node,
2031                                               cls_node))
2032                   {
2033                     /* It is against this limitation.
2034                        Remove it form the current sifted state.  */
2035                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2036                                                  candidates);
2037                     if (BE (err != REG_NOERROR, 0))
2038                       return err;
2039                     --node_idx;
2040                   }
2041               }
2042         }
2043       else /* (ent->subexp_to != str_idx)  */
2044         {
2045           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2046             {
2047               int node = dest_nodes->elems[node_idx];
2048               re_token_type_t type = dfa->nodes[node].type;
2049               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2050                 {
2051                   if (subexp_idx != dfa->nodes[node].opr.idx)
2052                     continue;
2053                   /* It is against this limitation.
2054                      Remove it form the current sifted state.  */
2055                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2056                                                candidates);
2057                   if (BE (err != REG_NOERROR, 0))
2058                     return err;
2059                 }
2060             }
2061         }
2062     }
2063   return REG_NOERROR;
2064 }
2065
2066 static reg_errcode_t
2067 internal_function
2068 sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2069                    int str_idx, const re_node_set *candidates)
2070 {
2071   re_dfa_t *const dfa = mctx->dfa;
2072   reg_errcode_t err;
2073   int node_idx, node;
2074   re_sift_context_t local_sctx;
2075   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2076
2077   if (first_idx == -1)
2078     return REG_NOERROR;
2079
2080   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2081
2082   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2083     {
2084       int enabled_idx;
2085       re_token_type_t type;
2086       struct re_backref_cache_entry *entry;
2087       node = candidates->elems[node_idx];
2088       type = dfa->nodes[node].type;
2089       /* Avoid infinite loop for the REs like "()\1+".  */
2090       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2091         continue;
2092       if (type != OP_BACK_REF)
2093         continue;
2094
2095       entry = mctx->bkref_ents + first_idx;
2096       enabled_idx = first_idx;
2097       do
2098         {
2099           int subexp_len, to_idx, dst_node, ret;
2100           re_dfastate_t *cur_state;
2101
2102           if (entry->node != node)
2103             continue;
2104           subexp_len = entry->subexp_to - entry->subexp_from;
2105           to_idx = str_idx + subexp_len;
2106           dst_node = (subexp_len ? dfa->nexts[node]
2107                       : dfa->edests[node].elems[0]);
2108
2109           if (to_idx > sctx->last_str_idx
2110               || sctx->sifted_states[to_idx] == NULL
2111               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2112               || check_dst_limits (mctx, &sctx->limits, node,
2113                                    str_idx, dst_node, to_idx))
2114             continue;
2115
2116           if (local_sctx.sifted_states == NULL)
2117             {
2118               local_sctx = *sctx;
2119               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2120               if (BE (err != REG_NOERROR, 0))
2121                 goto free_return;
2122             }
2123           local_sctx.last_node = node;
2124           local_sctx.last_str_idx = str_idx;
2125           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2126           if (BE (ret < 0, 0))
2127             {
2128               err = REG_ESPACE;
2129               goto free_return;
2130             }
2131           cur_state = local_sctx.sifted_states[str_idx];
2132           err = sift_states_backward (mctx, &local_sctx);
2133           if (BE (err != REG_NOERROR, 0))
2134             goto free_return;
2135           if (sctx->limited_states != NULL)
2136             {
2137               err = merge_state_array (dfa, sctx->limited_states,
2138                                        local_sctx.sifted_states,
2139                                        str_idx + 1);
2140               if (BE (err != REG_NOERROR, 0))
2141                 goto free_return;
2142             }
2143           local_sctx.sifted_states[str_idx] = cur_state;
2144           re_node_set_remove (&local_sctx.limits, enabled_idx);
2145
2146           /* mctx->bkref_ents may have changed, reload the pointer.  */
2147           entry = mctx->bkref_ents + enabled_idx;
2148         }
2149       while (enabled_idx++, entry++->more);
2150     }
2151   err = REG_NOERROR;
2152  free_return:
2153   if (local_sctx.sifted_states != NULL)
2154     {
2155       re_node_set_free (&local_sctx.limits);
2156     }
2157
2158   return err;
2159 }
2160
2161
2162 #ifdef RE_ENABLE_I18N
2163 static int
2164 internal_function
2165 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2166                      int node_idx, int str_idx, int max_str_idx)
2167 {
2168   re_dfa_t *const dfa = mctx->dfa;
2169   int naccepted;
2170   /* Check the node can accept `multi byte'.  */
2171   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2172   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2173       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2174                             dfa->nexts[node_idx]))
2175     /* The node can't accept the `multi byte', or the
2176        destination was already thrown away, then the node
2177        could't accept the current input `multi byte'.   */
2178     naccepted = 0;
2179   /* Otherwise, it is sure that the node could accept
2180      `naccepted' bytes input.  */
2181   return naccepted;
2182 }
2183 #endif /* RE_ENABLE_I18N */
2184
2185 \f
2186 /* Functions for state transition.  */
2187
2188 /* Return the next state to which the current state STATE will transit by
2189    accepting the current input byte, and update STATE_LOG if necessary.
2190    If STATE can accept a multibyte char/collating element/back reference
2191    update the destination of STATE_LOG.  */
2192
2193 static re_dfastate_t *
2194 internal_function
2195 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2196                re_dfastate_t *state)
2197 {
2198   re_dfastate_t **trtable;
2199   unsigned char ch;
2200
2201 #ifdef RE_ENABLE_I18N
2202   /* If the current state can accept multibyte.  */
2203   if (BE (state->accept_mb, 0))
2204     {
2205       *err = transit_state_mb (mctx, state);
2206       if (BE (*err != REG_NOERROR, 0))
2207         return NULL;
2208     }
2209 #endif /* RE_ENABLE_I18N */
2210
2211   /* Then decide the next state with the single byte.  */
2212 #if 0
2213   if (0)
2214     /* don't use transition table  */
2215     return transit_state_sb (err, mctx, state);
2216 #endif
2217
2218   /* Use transition table  */
2219   ch = re_string_fetch_byte (&mctx->input);
2220   for (;;)
2221     {
2222       trtable = state->trtable;
2223       if (BE (trtable != NULL, 1))
2224         return trtable[ch];
2225
2226       trtable = state->word_trtable;
2227       if (BE (trtable != NULL, 1))
2228         {
2229           unsigned int context;
2230           context
2231             = re_string_context_at (&mctx->input,
2232                                     re_string_cur_idx (&mctx->input) - 1,
2233                                     mctx->eflags);
2234           if (IS_WORD_CONTEXT (context))
2235             return trtable[ch + SBC_MAX];
2236           else
2237             return trtable[ch];
2238         }
2239
2240       if (!build_trtable (mctx->dfa, state))
2241         {
2242           *err = REG_ESPACE;
2243           return NULL;
2244         }
2245
2246       /* Retry, we now have a transition table.  */
2247     }
2248 }
2249
2250 /* Update the state_log if we need */
2251 re_dfastate_t *
2252 internal_function
2253 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2254                       re_dfastate_t *next_state)
2255 {
2256   re_dfa_t *const dfa = mctx->dfa;
2257   int cur_idx = re_string_cur_idx (&mctx->input);
2258
2259   if (cur_idx > mctx->state_log_top)
2260     {
2261       mctx->state_log[cur_idx] = next_state;
2262       mctx->state_log_top = cur_idx;
2263     }
2264   else if (mctx->state_log[cur_idx] == 0)
2265     {
2266       mctx->state_log[cur_idx] = next_state;
2267     }
2268   else
2269     {
2270       re_dfastate_t *pstate;
2271       unsigned int context;
2272       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2273       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2274          the destination of a multibyte char/collating element/
2275          back reference.  Then the next state is the union set of
2276          these destinations and the results of the transition table.  */
2277       pstate = mctx->state_log[cur_idx];
2278       log_nodes = pstate->entrance_nodes;
2279       if (next_state != NULL)
2280         {
2281           table_nodes = next_state->entrance_nodes;
2282           *err = re_node_set_init_union (&next_nodes, table_nodes,
2283                                              log_nodes);
2284           if (BE (*err != REG_NOERROR, 0))
2285             return NULL;
2286         }
2287       else
2288         next_nodes = *log_nodes;
2289       /* Note: We already add the nodes of the initial state,
2290          then we don't need to add them here.  */
2291
2292       context = re_string_context_at (&mctx->input,
2293                                       re_string_cur_idx (&mctx->input) - 1,
2294                                       mctx->eflags);
2295       next_state = mctx->state_log[cur_idx]
2296         = re_acquire_state_context (err, dfa, &next_nodes, context);
2297       /* We don't need to check errors here, since the return value of
2298          this function is next_state and ERR is already set.  */
2299
2300       if (table_nodes != NULL)
2301         re_node_set_free (&next_nodes);
2302     }
2303
2304   if (BE (dfa->nbackref, 0) && next_state != NULL)
2305     {
2306       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2307          later.  We must check them here, since the back references in the
2308          next state might use them.  */
2309       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2310                                         cur_idx);
2311       if (BE (*err != REG_NOERROR, 0))
2312         return NULL;
2313
2314       /* If the next state has back references.  */
2315       if (next_state->has_backref)
2316         {
2317           *err = transit_state_bkref (mctx, &next_state->nodes);
2318           if (BE (*err != REG_NOERROR, 0))
2319             return NULL;
2320           next_state = mctx->state_log[cur_idx];
2321         }
2322     }
2323
2324   return next_state;
2325 }
2326
2327 /* Skip bytes in the input that correspond to part of a
2328    multi-byte match, then look in the log for a state
2329    from which to restart matching.  */
2330 static re_dfastate_t *
2331 internal_function
2332 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2333 {
2334   re_dfastate_t *cur_state = NULL;
2335   do
2336     {
2337       int max = mctx->state_log_top;
2338       int cur_str_idx = re_string_cur_idx (&mctx->input);
2339
2340       do
2341         {
2342           if (++cur_str_idx > max)
2343             return NULL;
2344           re_string_skip_bytes (&mctx->input, 1);
2345         }
2346       while (mctx->state_log[cur_str_idx] == NULL);
2347
2348       cur_state = merge_state_with_log (err, mctx, NULL);
2349     }
2350   while (*err == REG_NOERROR && cur_state == NULL);
2351   return cur_state;
2352 }
2353
2354 /* Helper functions for transit_state.  */
2355
2356 /* From the node set CUR_NODES, pick up the nodes whose types are
2357    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2358    expression. And register them to use them later for evaluating the
2359    correspoding back references.  */
2360
2361 static reg_errcode_t
2362 internal_function
2363 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2364                            int str_idx)
2365 {
2366   re_dfa_t *const dfa = mctx->dfa;
2367   int node_idx;
2368   reg_errcode_t err;
2369
2370   /* TODO: This isn't efficient.
2371            Because there might be more than one nodes whose types are
2372            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2373            nodes.
2374            E.g. RE: (a){2}  */
2375   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2376     {
2377       int node = cur_nodes->elems[node_idx];
2378       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2379           && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2380           && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2381         {
2382           err = match_ctx_add_subtop (mctx, node, str_idx);
2383           if (BE (err != REG_NOERROR, 0))
2384             return err;
2385         }
2386     }
2387   return REG_NOERROR;
2388 }
2389
2390 #if 0
2391 /* Return the next state to which the current state STATE will transit by
2392    accepting the current input byte.  */
2393
2394 static re_dfastate_t *
2395 transit_state_sb (err, mctx, state)
2396      reg_errcode_t *err;
2397      re_match_context_t *mctx;
2398      re_dfastate_t *state;
2399 {
2400   re_dfa_t *const dfa = mctx->dfa;
2401   re_node_set next_nodes;
2402   re_dfastate_t *next_state;
2403   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2404   unsigned int context;
2405
2406   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2407   if (BE (*err != REG_NOERROR, 0))
2408     return NULL;
2409   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2410     {
2411       int cur_node = state->nodes.elems[node_cnt];
2412       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2413         {
2414           *err = re_node_set_merge (&next_nodes,
2415                                     dfa->eclosures + dfa->nexts[cur_node]);
2416           if (BE (*err != REG_NOERROR, 0))
2417             {
2418               re_node_set_free (&next_nodes);
2419               return NULL;
2420             }
2421         }
2422     }
2423   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2424   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2425   /* We don't need to check errors here, since the return value of
2426      this function is next_state and ERR is already set.  */
2427
2428   re_node_set_free (&next_nodes);
2429   re_string_skip_bytes (&mctx->input, 1);
2430   return next_state;
2431 }
2432 #endif
2433
2434 #ifdef RE_ENABLE_I18N
2435 static reg_errcode_t
2436 internal_function
2437 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2438 {
2439   re_dfa_t *const dfa = mctx->dfa;
2440   reg_errcode_t err;
2441   int i;
2442
2443   for (i = 0; i < pstate->nodes.nelem; ++i)
2444     {
2445       re_node_set dest_nodes, *new_nodes;
2446       int cur_node_idx = pstate->nodes.elems[i];
2447       int naccepted, dest_idx;
2448       unsigned int context;
2449       re_dfastate_t *dest_state;
2450
2451       if (!dfa->nodes[cur_node_idx].accept_mb)
2452         continue;
2453
2454       if (dfa->nodes[cur_node_idx].constraint)
2455         {
2456           context = re_string_context_at (&mctx->input,
2457                                           re_string_cur_idx (&mctx->input),
2458                                           mctx->eflags);
2459           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2460                                            context))
2461             continue;
2462         }
2463
2464       /* How many bytes the node can accept?  */
2465       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2466                                            re_string_cur_idx (&mctx->input));
2467       if (naccepted == 0)
2468         continue;
2469
2470       /* The node can accepts `naccepted' bytes.  */
2471       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2472       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2473                                : mctx->max_mb_elem_len);
2474       err = clean_state_log_if_needed (mctx, dest_idx);
2475       if (BE (err != REG_NOERROR, 0))
2476         return err;
2477 #ifdef DEBUG
2478       assert (dfa->nexts[cur_node_idx] != -1);
2479 #endif
2480       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2481
2482       dest_state = mctx->state_log[dest_idx];
2483       if (dest_state == NULL)
2484         dest_nodes = *new_nodes;
2485       else
2486         {
2487           err = re_node_set_init_union (&dest_nodes,
2488                                         dest_state->entrance_nodes, new_nodes);
2489           if (BE (err != REG_NOERROR, 0))
2490             return err;
2491         }
2492       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2493       mctx->state_log[dest_idx]
2494         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2495       if (dest_state != NULL)
2496         re_node_set_free (&dest_nodes);
2497       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2498         return err;
2499     }
2500   return REG_NOERROR;
2501 }
2502 #endif /* RE_ENABLE_I18N */
2503
2504 static reg_errcode_t
2505 internal_function
2506 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2507 {
2508   re_dfa_t *const dfa = mctx->dfa;
2509   reg_errcode_t err;
2510   int i;
2511   int cur_str_idx = re_string_cur_idx (&mctx->input);
2512
2513   for (i = 0; i < nodes->nelem; ++i)
2514     {
2515       int dest_str_idx, prev_nelem, bkc_idx;
2516       int node_idx = nodes->elems[i];
2517       unsigned int context;
2518       const re_token_t *node = dfa->nodes + node_idx;
2519       re_node_set *new_dest_nodes;
2520
2521       /* Check whether `node' is a backreference or not.  */
2522       if (node->type != OP_BACK_REF)
2523         continue;
2524
2525       if (node->constraint)
2526         {
2527           context = re_string_context_at (&mctx->input, cur_str_idx,
2528                                           mctx->eflags);
2529           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2530             continue;
2531         }
2532
2533       /* `node' is a backreference.
2534          Check the substring which the substring matched.  */
2535       bkc_idx = mctx->nbkref_ents;
2536       err = get_subexp (mctx, node_idx, cur_str_idx);
2537       if (BE (err != REG_NOERROR, 0))
2538         goto free_return;
2539
2540       /* And add the epsilon closures (which is `new_dest_nodes') of
2541          the backreference to appropriate state_log.  */
2542 #ifdef DEBUG
2543       assert (dfa->nexts[node_idx] != -1);
2544 #endif
2545       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2546         {
2547           int subexp_len;
2548           re_dfastate_t *dest_state;
2549           struct re_backref_cache_entry *bkref_ent;
2550           bkref_ent = mctx->bkref_ents + bkc_idx;
2551           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2552             continue;
2553           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2554           new_dest_nodes = (subexp_len == 0
2555                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2556                             : dfa->eclosures + dfa->nexts[node_idx]);
2557           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2558                           - bkref_ent->subexp_from);
2559           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2560                                           mctx->eflags);
2561           dest_state = mctx->state_log[dest_str_idx];
2562           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2563                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2564           /* Add `new_dest_node' to state_log.  */
2565           if (dest_state == NULL)
2566             {
2567               mctx->state_log[dest_str_idx]
2568                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2569                                             context);
2570               if (BE (mctx->state_log[dest_str_idx] == NULL
2571                       && err != REG_NOERROR, 0))
2572                 goto free_return;
2573             }
2574           else
2575             {
2576               re_node_set dest_nodes;
2577               err = re_node_set_init_union (&dest_nodes,
2578                                             dest_state->entrance_nodes,
2579                                             new_dest_nodes);
2580               if (BE (err != REG_NOERROR, 0))
2581                 {
2582                   re_node_set_free (&dest_nodes);
2583                   goto free_return;
2584                 }
2585               mctx->state_log[dest_str_idx]
2586                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2587               re_node_set_free (&dest_nodes);
2588               if (BE (mctx->state_log[dest_str_idx] == NULL
2589                       && err != REG_NOERROR, 0))
2590                 goto free_return;
2591             }
2592           /* We need to check recursively if the backreference can epsilon
2593              transit.  */
2594           if (subexp_len == 0
2595               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2596             {
2597               err = check_subexp_matching_top (mctx, new_dest_nodes,
2598                                                cur_str_idx);
2599               if (BE (err != REG_NOERROR, 0))
2600                 goto free_return;
2601               err = transit_state_bkref (mctx, new_dest_nodes);
2602               if (BE (err != REG_NOERROR, 0))
2603                 goto free_return;
2604             }
2605         }
2606     }
2607   err = REG_NOERROR;
2608  free_return:
2609   return err;
2610 }
2611
2612 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2613    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2614    Note that we might collect inappropriate candidates here.
2615    However, the cost of checking them strictly here is too high, then we
2616    delay these checking for prune_impossible_nodes().  */
2617
2618 static reg_errcode_t
2619 internal_function
2620 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2621 {
2622   re_dfa_t *const dfa = mctx->dfa;
2623   int subexp_num, sub_top_idx;
2624   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2625   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2626   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2627   if (cache_idx != -1)
2628     {
2629       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2630       do
2631         if (entry->node == bkref_node)
2632           return REG_NOERROR; /* We already checked it.  */
2633       while (entry++->more);
2634     }
2635
2636   subexp_num = dfa->nodes[bkref_node].opr.idx;
2637
2638   /* For each sub expression  */
2639   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2640     {
2641       reg_errcode_t err;
2642       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2643       re_sub_match_last_t *sub_last;
2644       int sub_last_idx, sl_str, bkref_str_off;
2645
2646       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2647         continue; /* It isn't related.  */
2648
2649       sl_str = sub_top->str_idx;
2650       bkref_str_off = bkref_str_idx;
2651       /* At first, check the last node of sub expressions we already
2652          evaluated.  */
2653       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2654         {
2655           int sl_str_diff;
2656           sub_last = sub_top->lasts[sub_last_idx];
2657           sl_str_diff = sub_last->str_idx - sl_str;
2658           /* The matched string by the sub expression match with the substring
2659              at the back reference?  */
2660           if (sl_str_diff > 0)
2661             {
2662               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2663                 {
2664                   /* Not enough chars for a successful match.  */
2665                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2666                     break;
2667
2668                   err = clean_state_log_if_needed (mctx,
2669                                                    bkref_str_off
2670                                                    + sl_str_diff);
2671                   if (BE (err != REG_NOERROR, 0))
2672                     return err;
2673                   buf = (const char *) re_string_get_buffer (&mctx->input);
2674                 }
2675               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2676                 break; /* We don't need to search this sub expression any more.  */
2677             }
2678           bkref_str_off += sl_str_diff;
2679           sl_str += sl_str_diff;
2680           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2681                                 bkref_str_idx);
2682
2683           /* Reload buf, since the preceding call might have reallocated
2684              the buffer.  */
2685           buf = (const char *) re_string_get_buffer (&mctx->input);
2686
2687           if (err == REG_NOMATCH)
2688             continue;
2689           if (BE (err != REG_NOERROR, 0))
2690             return err;
2691         }
2692
2693       if (sub_last_idx < sub_top->nlasts)
2694         continue;
2695       if (sub_last_idx > 0)
2696         ++sl_str;
2697       /* Then, search for the other last nodes of the sub expression.  */
2698       for (; sl_str <= bkref_str_idx; ++sl_str)
2699         {
2700           int cls_node, sl_str_off;
2701           const re_node_set *nodes;
2702           sl_str_off = sl_str - sub_top->str_idx;
2703           /* The matched string by the sub expression match with the substring
2704              at the back reference?  */
2705           if (sl_str_off > 0)
2706             {
2707               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2708                 {
2709                   /* If we are at the end of the input, we cannot match.  */
2710                   if (bkref_str_off >= mctx->input.len)
2711                     break;
2712
2713                   err = extend_buffers (mctx);
2714                   if (BE (err != REG_NOERROR, 0))
2715                     return err;
2716
2717                   buf = (const char *) re_string_get_buffer (&mctx->input);
2718                 }
2719               if (buf [bkref_str_off++] != buf[sl_str - 1])
2720                 break; /* We don't need to search this sub expression
2721                           any more.  */
2722             }
2723           if (mctx->state_log[sl_str] == NULL)
2724             continue;
2725           /* Does this state have a ')' of the sub expression?  */
2726           nodes = &mctx->state_log[sl_str]->nodes;
2727           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2728           if (cls_node == -1)
2729             continue; /* No.  */
2730           if (sub_top->path == NULL)
2731             {
2732               sub_top->path = re_calloc (state_array_t,
2733                                          sl_str - sub_top->str_idx + 1);
2734               if (sub_top->path == NULL)
2735                 return REG_ESPACE;
2736             }
2737           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2738              in the current context?  */
2739           err = check_arrival (mctx, sub_top->path, sub_top->node,
2740                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2741           if (err == REG_NOMATCH)
2742               continue;
2743           if (BE (err != REG_NOERROR, 0))
2744               return err;
2745           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2746           if (BE (sub_last == NULL, 0))
2747             return REG_ESPACE;
2748           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2749                                 bkref_str_idx);
2750           if (err == REG_NOMATCH)
2751             continue;
2752         }
2753     }
2754   return REG_NOERROR;
2755 }
2756
2757 /* Helper functions for get_subexp().  */
2758
2759 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2760    If it can arrive, register the sub expression expressed with SUB_TOP
2761    and SUB_LAST.  */
2762
2763 static reg_errcode_t
2764 internal_function
2765 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2766                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2767 {
2768   reg_errcode_t err;
2769   int to_idx;
2770   /* Can the subexpression arrive the back reference?  */
2771   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2772                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2773   if (err != REG_NOERROR)
2774     return err;
2775   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2776                              sub_last->str_idx);
2777   if (BE (err != REG_NOERROR, 0))
2778     return err;
2779   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2780   return clean_state_log_if_needed (mctx, to_idx);
2781 }
2782
2783 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2784    Search '(' if FL_OPEN, or search ')' otherwise.
2785    TODO: This function isn't efficient...
2786          Because there might be more than one nodes whose types are
2787          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2788          nodes.
2789          E.g. RE: (a){2}  */
2790
2791 static int
2792 internal_function
2793 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2794                   int subexp_idx, int type)
2795 {
2796   int cls_idx;
2797   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2798     {
2799       int cls_node = nodes->elems[cls_idx];
2800       const re_token_t *node = dfa->nodes + cls_node;
2801       if (node->type == type
2802           && node->opr.idx == subexp_idx)
2803         return cls_node;
2804     }
2805   return -1;
2806 }
2807
2808 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2809    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2810    heavily reused.
2811    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2812
2813 static reg_errcode_t
2814 internal_function
2815 check_arrival (re_match_context_t *mctx, state_array_t *path,
2816                int top_node, int top_str, int last_node, int last_str,
2817                int type)
2818 {
2819   re_dfa_t *const dfa = mctx->dfa;
2820   reg_errcode_t err;
2821   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2822   re_dfastate_t *cur_state = NULL;
2823   re_node_set *cur_nodes, next_nodes;
2824   re_dfastate_t **backup_state_log;
2825   unsigned int context;
2826
2827   subexp_num = dfa->nodes[top_node].opr.idx;
2828   /* Extend the buffer if we need.  */
2829   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2830     {
2831       re_dfastate_t **new_array;
2832       int old_alloc = path->alloc;
2833       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2834       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2835       if (new_array == NULL)
2836         {
2837           path->alloc = old_alloc;
2838           return REG_ESPACE;
2839         }
2840       path->array = new_array;
2841       memset (new_array + old_alloc, '\0',
2842               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2843     }
2844
2845   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2846
2847   /* Temporary modify MCTX.  */
2848   backup_state_log = mctx->state_log;
2849   backup_cur_idx = mctx->input.cur_idx;
2850   mctx->state_log = path->array;
2851   mctx->input.cur_idx = str_idx;
2852
2853   /* Setup initial node set.  */
2854   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2855   if (str_idx == top_str)
2856     {
2857       err = re_node_set_init_1 (&next_nodes, top_node);
2858       if (BE (err != REG_NOERROR, 0))
2859         return err;
2860       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2861       if (BE (err != REG_NOERROR, 0))
2862         {
2863           re_node_set_free (&next_nodes);
2864           return err;
2865         }
2866     }
2867   else
2868     {
2869       cur_state = mctx->state_log[str_idx];
2870       if (cur_state && cur_state->has_backref)
2871         {
2872           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2873           if (BE ( err != REG_NOERROR, 0))
2874             return err;
2875         }
2876       else
2877         re_node_set_init_empty (&next_nodes);
2878     }
2879   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2880     {
2881       if (next_nodes.nelem)
2882         {
2883           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2884                                     subexp_num, type);
2885           if (BE ( err != REG_NOERROR, 0))
2886             {
2887               re_node_set_free (&next_nodes);
2888               return err;
2889             }
2890         }
2891       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2892       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2893         {
2894           re_node_set_free (&next_nodes);
2895           return err;
2896         }
2897       mctx->state_log[str_idx] = cur_state;
2898     }
2899
2900   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2901     {
2902       re_node_set_empty (&next_nodes);
2903       if (mctx->state_log[str_idx + 1])
2904         {
2905           err = re_node_set_merge (&next_nodes,
2906                                    &mctx->state_log[str_idx + 1]->nodes);
2907           if (BE (err != REG_NOERROR, 0))
2908             {
2909               re_node_set_free (&next_nodes);
2910               return err;
2911             }
2912         }
2913       if (cur_state)
2914         {
2915           err = check_arrival_add_next_nodes (mctx, str_idx,
2916                                               &cur_state->non_eps_nodes, &next_nodes);
2917           if (BE (err != REG_NOERROR, 0))
2918             {
2919               re_node_set_free (&next_nodes);
2920               return err;
2921             }
2922         }
2923       ++str_idx;
2924       if (next_nodes.nelem)
2925         {
2926           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2927           if (BE (err != REG_NOERROR, 0))
2928             {
2929               re_node_set_free (&next_nodes);
2930               return err;
2931             }
2932           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2933                                     subexp_num, type);
2934           if (BE ( err != REG_NOERROR, 0))
2935             {
2936               re_node_set_free (&next_nodes);
2937               return err;
2938             }
2939         }
2940       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2941       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2942       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2943         {
2944           re_node_set_free (&next_nodes);
2945           return err;
2946         }
2947       mctx->state_log[str_idx] = cur_state;
2948       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2949     }
2950   re_node_set_free (&next_nodes);
2951   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2952                : &mctx->state_log[last_str]->nodes);
2953   path->next_idx = str_idx;
2954
2955   /* Fix MCTX.  */
2956   mctx->state_log = backup_state_log;
2957   mctx->input.cur_idx = backup_cur_idx;
2958
2959   /* Then check the current node set has the node LAST_NODE.  */
2960   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2961     return REG_NOERROR;
2962
2963   return REG_NOMATCH;
2964 }
2965
2966 /* Helper functions for check_arrival.  */
2967
2968 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2969    to NEXT_NODES.
2970    TODO: This function is similar to the functions transit_state*(),
2971          however this function has many additional works.
2972          Can't we unify them?  */
2973
2974 static reg_errcode_t
2975 internal_function
2976 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
2977                               re_node_set *cur_nodes,
2978                               re_node_set *next_nodes)
2979 {
2980   re_dfa_t *const dfa = mctx->dfa;
2981   int result;
2982   int cur_idx;
2983   reg_errcode_t err;
2984   re_node_set union_set;
2985   re_node_set_init_empty (&union_set);
2986   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2987     {
2988       int naccepted = 0;
2989       int cur_node = cur_nodes->elems[cur_idx];
2990 #ifdef DEBUG
2991       re_token_type_t type = dfa->nodes[cur_node].type;
2992       assert (!IS_EPSILON_NODE (type));
2993 #endif
2994 #ifdef RE_ENABLE_I18N
2995       /* If the node may accept `multi byte'.  */
2996       if (dfa->nodes[cur_node].accept_mb)
2997         {
2998           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2999                                                str_idx);
3000           if (naccepted > 1)
3001             {
3002               re_dfastate_t *dest_state;
3003               int next_node = dfa->nexts[cur_node];
3004               int next_idx = str_idx + naccepted;
3005               dest_state = mctx->state_log[next_idx];
3006               re_node_set_empty (&union_set);
3007               if (dest_state)
3008                 {
3009                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3010                   if (BE (err != REG_NOERROR, 0))
3011                     {
3012                       re_node_set_free (&union_set);
3013                       return err;
3014                     }
3015                 }
3016               result = re_node_set_insert (&union_set, next_node);
3017               if (BE (result < 0, 0))
3018                 {
3019                   re_node_set_free (&union_set);
3020                   return REG_ESPACE;
3021                 }
3022               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3023                                                             &union_set);
3024               if (BE (mctx->state_log[next_idx] == NULL
3025                       && err != REG_NOERROR, 0))
3026                 {
3027                   re_node_set_free (&union_set);
3028                   return err;
3029                 }
3030             }
3031         }
3032 #endif /* RE_ENABLE_I18N */
3033       if (naccepted
3034           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3035         {
3036           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3037           if (BE (result < 0, 0))
3038             {
3039               re_node_set_free (&union_set);
3040               return REG_ESPACE;
3041             }
3042         }
3043     }
3044   re_node_set_free (&union_set);
3045   return REG_NOERROR;
3046 }
3047
3048 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3049    CUR_NODES, however exclude the nodes which are:
3050     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3051     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3052 */
3053
3054 static reg_errcode_t
3055 internal_function
3056 check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3057                           int ex_subexp, int type)
3058 {
3059   reg_errcode_t err;
3060   int idx, outside_node;
3061   re_node_set new_nodes;
3062 #ifdef DEBUG
3063   assert (cur_nodes->nelem);
3064 #endif
3065   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3066   if (BE (err != REG_NOERROR, 0))
3067     return err;
3068   /* Create a new node set NEW_NODES with the nodes which are epsilon
3069      closures of the node in CUR_NODES.  */
3070
3071   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3072     {
3073       int cur_node = cur_nodes->elems[idx];
3074       re_node_set *eclosure = dfa->eclosures + cur_node;
3075       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3076       if (outside_node == -1)
3077         {
3078           /* There are no problematic nodes, just merge them.  */
3079           err = re_node_set_merge (&new_nodes, eclosure);
3080           if (BE (err != REG_NOERROR, 0))
3081             {
3082               re_node_set_free (&new_nodes);
3083               return err;
3084             }
3085         }
3086       else
3087         {
3088           /* There are problematic nodes, re-calculate incrementally.  */
3089           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3090                                               ex_subexp, type);
3091           if (BE (err != REG_NOERROR, 0))
3092             {
3093               re_node_set_free (&new_nodes);
3094               return err;
3095             }
3096         }
3097     }
3098   re_node_set_free (cur_nodes);
3099   *cur_nodes = new_nodes;
3100   return REG_NOERROR;
3101 }
3102
3103 /* Helper function for check_arrival_expand_ecl.
3104    Check incrementally the epsilon closure of TARGET, and if it isn't
3105    problematic append it to DST_NODES.  */
3106
3107 static reg_errcode_t
3108 internal_function
3109 check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3110                               int target, int ex_subexp, int type)
3111 {
3112   int cur_node;
3113   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3114     {
3115       int err;
3116
3117       if (dfa->nodes[cur_node].type == type
3118           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3119         {
3120           if (type == OP_CLOSE_SUBEXP)
3121             {
3122               err = re_node_set_insert (dst_nodes, cur_node);
3123               if (BE (err == -1, 0))
3124                 return REG_ESPACE;
3125             }
3126           break;
3127         }
3128       err = re_node_set_insert (dst_nodes, cur_node);
3129       if (BE (err == -1, 0))
3130         return REG_ESPACE;
3131       if (dfa->edests[cur_node].nelem == 0)
3132         break;
3133       if (dfa->edests[cur_node].nelem == 2)
3134         {
3135           reg_errcode_t ret =
3136             check_arrival_expand_ecl_sub (dfa, dst_nodes,
3137                                           dfa->edests[cur_node].elems[1],
3138                                           ex_subexp, type);
3139           if (BE (ret != REG_NOERROR, 0))
3140             return ret;
3141         }
3142       cur_node = dfa->edests[cur_node].elems[0];
3143     }
3144   return REG_NOERROR;
3145 }
3146
3147
3148 /* For all the back references in the current state, calculate the
3149    destination of the back references by the appropriate entry
3150    in MCTX->BKREF_ENTS.  */
3151
3152 static reg_errcode_t
3153 internal_function
3154 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3155                     int cur_str, int subexp_num, int type)
3156 {
3157   re_dfa_t *const dfa = mctx->dfa;
3158   reg_errcode_t err;
3159   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3160   struct re_backref_cache_entry *ent;
3161
3162   if (cache_idx_start == -1)
3163     return REG_NOERROR;
3164
3165  restart:
3166   ent = mctx->bkref_ents + cache_idx_start;
3167   do
3168     {
3169       int to_idx, next_node;
3170
3171       /* Is this entry ENT is appropriate?  */
3172       if (!re_node_set_contains (cur_nodes, ent->node))
3173         continue; /* No.  */
3174
3175       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3176       /* Calculate the destination of the back reference, and append it
3177          to MCTX->STATE_LOG.  */
3178       if (to_idx == cur_str)
3179         {
3180           /* The backreference did epsilon transit, we must re-check all the
3181              node in the current state.  */
3182           re_node_set new_dests;
3183           reg_errcode_t err2, err3;
3184           next_node = dfa->edests[ent->node].elems[0];
3185           if (re_node_set_contains (cur_nodes, next_node))
3186             continue;
3187           err = re_node_set_init_1 (&new_dests, next_node);
3188           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3189           err3 = re_node_set_merge (cur_nodes, &new_dests);
3190           re_node_set_free (&new_dests);
3191           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3192                   || err3 != REG_NOERROR, 0))
3193             {
3194               err = (err != REG_NOERROR ? err
3195                      : (err2 != REG_NOERROR ? err2 : err3));
3196               return err;
3197             }
3198           /* TODO: It is still inefficient...  */
3199           goto restart;
3200         }
3201       else
3202         {
3203           re_node_set union_set;
3204           next_node = dfa->nexts[ent->node];
3205           if (mctx->state_log[to_idx])
3206             {
3207               int ret;
3208               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3209                                         next_node))
3210                 continue;
3211               err = re_node_set_init_copy (&union_set,
3212                                            &mctx->state_log[to_idx]->nodes);
3213               ret = re_node_set_insert (&union_set, next_node);
3214               if (BE (err != REG_NOERROR || ret < 0, 0))
3215                 {
3216                   re_node_set_free (&union_set);
3217                   err = err != REG_NOERROR ? err : REG_ESPACE;
3218                   return err;
3219                 }
3220             }
3221           else
3222             {
3223               err = re_node_set_init_1 (&union_set, next_node);
3224               if (BE (err != REG_NOERROR, 0))
3225                 return err;
3226             }
3227           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3228           re_node_set_free (&union_set);
3229           if (BE (mctx->state_log[to_idx] == NULL
3230                   && err != REG_NOERROR, 0))
3231             return err;
3232         }
3233     }
3234   while (ent++->more);
3235   return REG_NOERROR;
3236 }
3237
3238 /* Build transition table for the state.
3239    Return 1 if succeeded, otherwise return NULL.  */
3240
3241 static int
3242 internal_function
3243 build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3244 {
3245   reg_errcode_t err;
3246   int i, j, ch, need_word_trtable = 0;
3247   unsigned int elem, mask;
3248   int dests_node_malloced = 0, dest_states_malloced = 0;
3249   int ndests; /* Number of the destination states from `state'.  */
3250   re_dfastate_t **trtable;
3251   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3252   re_node_set follows, *dests_node;
3253   bitset *dests_ch;
3254   bitset acceptable;
3255
3256   /* We build DFA states which corresponds to the destination nodes
3257      from `state'.  `dests_node[i]' represents the nodes which i-th
3258      destination state contains, and `dests_ch[i]' represents the
3259      characters which i-th destination state accepts.  */
3260   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3261     dests_node = (re_node_set *)
3262       alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3263   else
3264     {
3265       dests_node = (re_node_set *)
3266         malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3267       if (BE (dests_node == NULL, 0))
3268         return 0;
3269       dests_node_malloced = 1;
3270     }
3271   dests_ch = (bitset *) (dests_node + SBC_MAX);
3272
3273   /* Initialize transiton table.  */
3274   state->word_trtable = state->trtable = NULL;
3275
3276   /* At first, group all nodes belonging to `state' into several
3277      destinations.  */
3278   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3279   if (BE (ndests <= 0, 0))
3280     {
3281       if (dests_node_malloced)
3282         free (dests_node);
3283       /* Return 0 in case of an error, 1 otherwise.  */
3284       if (ndests == 0)
3285         {
3286           state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3287           return 1;
3288         }
3289       return 0;
3290     }
3291
3292   err = re_node_set_alloc (&follows, ndests + 1);
3293   if (BE (err != REG_NOERROR, 0))
3294     goto out_free;
3295
3296   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3297                          + ndests * 3 * sizeof (re_dfastate_t *)))
3298     dest_states = (re_dfastate_t **)
3299       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3300   else
3301     {
3302       dest_states = (re_dfastate_t **)
3303         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3304       if (BE (dest_states == NULL, 0))
3305         {
3306 out_free:
3307           if (dest_states_malloced)
3308             free (dest_states);
3309           re_node_set_free (&follows);
3310           for (i = 0; i < ndests; ++i)
3311             re_node_set_free (dests_node + i);
3312           if (dests_node_malloced)
3313             free (dests_node);
3314           return 0;
3315         }
3316       dest_states_malloced = 1;
3317     }
3318   dest_states_word = dest_states + ndests;
3319   dest_states_nl = dest_states_word + ndests;
3320   bitset_empty (acceptable);
3321
3322   /* Then build the states for all destinations.  */
3323   for (i = 0; i < ndests; ++i)
3324     {
3325       int next_node;
3326       re_node_set_empty (&follows);
3327       /* Merge the follows of this destination states.  */
3328       for (j = 0; j < dests_node[i].nelem; ++j)
3329         {
3330           next_node = dfa->nexts[dests_node[i].elems[j]];
3331           if (next_node != -1)
3332             {
3333               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3334               if (BE (err != REG_NOERROR, 0))
3335                 goto out_free;
3336             }
3337         }
3338       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3339       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3340         goto out_free;
3341       /* If the new state has context constraint,
3342          build appropriate states for these contexts.  */
3343       if (dest_states[i]->has_constraint)
3344         {
3345           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3346                                                           CONTEXT_WORD);
3347           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3348             goto out_free;
3349
3350           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3351             need_word_trtable = 1;
3352
3353           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3354                                                         CONTEXT_NEWLINE);
3355           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3356             goto out_free;
3357         }
3358       else
3359         {
3360           dest_states_word[i] = dest_states[i];
3361           dest_states_nl[i] = dest_states[i];
3362         }
3363       bitset_merge (acceptable, dests_ch[i]);
3364     }
3365
3366   if (!BE (need_word_trtable, 0))
3367     {
3368       /* We don't care about whether the following character is a word
3369          character, or we are in a single-byte character set so we can
3370          discern by looking at the character code: allocate a
3371          256-entry transition table.  */
3372       trtable = state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3373       if (BE (trtable == NULL, 0))
3374         goto out_free;
3375
3376       /* For all characters ch...:  */
3377       for (i = 0; i < BITSET_UINTS; ++i)
3378         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3379              elem;
3380              mask <<= 1, elem >>= 1, ++ch)
3381           if (BE (elem & 1, 0))
3382             {
3383               /* There must be exactly one destination which accepts
3384                  character ch.  See group_nodes_into_DFAstates.  */
3385               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3386                 ;
3387
3388               /* j-th destination accepts the word character ch.  */
3389               if (dfa->word_char[i] & mask)
3390                 trtable[ch] = dest_states_word[j];
3391               else
3392                 trtable[ch] = dest_states[j];
3393             }
3394     }
3395   else
3396     {
3397       /* We care about whether the following character is a word
3398          character, and we are in a multi-byte character set: discern
3399          by looking at the character code: build two 256-entry
3400          transition tables, one starting at trtable[0] and one
3401          starting at trtable[SBC_MAX].  */
3402       trtable = state->word_trtable = re_calloc (re_dfastate_t *, 2 * SBC_MAX);
3403       if (BE (trtable == NULL, 0))
3404         goto out_free;
3405
3406       /* For all characters ch...:  */
3407       for (i = 0; i < BITSET_UINTS; ++i)
3408         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3409              elem;
3410              mask <<= 1, elem >>= 1, ++ch)
3411           if (BE (elem & 1, 0))
3412             {
3413               /* There must be exactly one destination which accepts
3414                  character ch.  See group_nodes_into_DFAstates.  */
3415               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3416                 ;
3417
3418               /* j-th destination accepts the word character ch.  */
3419               trtable[ch] = dest_states[j];
3420               trtable[ch + SBC_MAX] = dest_states_word[j];
3421             }
3422     }
3423
3424   /* new line */
3425   if (bitset_contain (acceptable, NEWLINE_CHAR))
3426     {
3427       /* The current state accepts newline character.  */
3428       for (j = 0; j < ndests; ++j)
3429         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3430           {
3431             /* k-th destination accepts newline character.  */
3432             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3433             if (need_word_trtable)
3434               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3435             /* There must be only one destination which accepts
3436                newline.  See group_nodes_into_DFAstates.  */
3437             break;
3438           }
3439     }
3440
3441   if (dest_states_malloced)
3442     free (dest_states);
3443
3444   re_node_set_free (&follows);
3445   for (i = 0; i < ndests; ++i)
3446     re_node_set_free (dests_node + i);
3447
3448   if (dests_node_malloced)
3449     free (dests_node);
3450
3451   return 1;
3452 }
3453
3454 /* Group all nodes belonging to STATE into several destinations.
3455    Then for all destinations, set the nodes belonging to the destination
3456    to DESTS_NODE[i] and set the characters accepted by the destination
3457    to DEST_CH[i].  This function return the number of destinations.  */
3458
3459 static int
3460 internal_function
3461 group_nodes_into_DFAstates (re_dfa_t *dfa, const re_dfastate_t *state,
3462                             re_node_set *dests_node, bitset *dests_ch)
3463 {
3464   reg_errcode_t err;
3465   int result;
3466   int i, j, k;
3467   int ndests; /* Number of the destinations from `state'.  */
3468   bitset accepts; /* Characters a node can accept.  */
3469   const re_node_set *cur_nodes = &state->nodes;
3470   bitset_empty (accepts);
3471   ndests = 0;
3472
3473   /* For all the nodes belonging to `state',  */
3474   for (i = 0; i < cur_nodes->nelem; ++i)
3475     {
3476       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3477       re_token_type_t type = node->type;
3478       unsigned int constraint = node->constraint;
3479
3480       /* Enumerate all single byte character this node can accept.  */
3481       if (type == CHARACTER)
3482         bitset_set (accepts, node->opr.c);
3483       else if (type == SIMPLE_BRACKET)
3484         {
3485           bitset_merge (accepts, node->opr.sbcset);
3486         }
3487       else if (type == OP_PERIOD)
3488         {
3489 #ifdef RE_ENABLE_I18N
3490           if (dfa->mb_cur_max > 1)
3491             bitset_merge (accepts, dfa->sb_char);
3492           else
3493 #endif
3494             bitset_set_all (accepts);
3495           if (!(dfa->syntax & REG_DOT_NEWLINE))
3496             bitset_clear (accepts, '\n');
3497           if (dfa->syntax & REG_DOT_NOT_NULL)
3498             bitset_clear (accepts, '\0');
3499         }
3500 #ifdef RE_ENABLE_I18N
3501       else if (type == OP_UTF8_PERIOD)
3502         {
3503           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3504           if (!(dfa->syntax & REG_DOT_NEWLINE))
3505             bitset_clear (accepts, '\n');
3506           if (dfa->syntax & REG_DOT_NOT_NULL)
3507             bitset_clear (accepts, '\0');
3508         }
3509 #endif
3510       else
3511         continue;
3512
3513       /* Check the `accepts' and sift the characters which are not
3514          match it the context.  */
3515       if (constraint)
3516         {
3517           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3518             {
3519               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3520               bitset_empty (accepts);
3521               if (accepts_newline)
3522                 bitset_set (accepts, NEWLINE_CHAR);
3523               else
3524                 continue;
3525             }
3526           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3527             {
3528               bitset_empty (accepts);
3529               continue;
3530             }
3531
3532           if (constraint & NEXT_WORD_CONSTRAINT)
3533             {
3534               unsigned int any_set = 0;
3535               if (type == CHARACTER && !node->word_char)
3536                 {
3537                   bitset_empty (accepts);
3538                   continue;
3539                 }
3540 #ifdef RE_ENABLE_I18N
3541               if (dfa->mb_cur_max > 1)
3542                 for (j = 0; j < BITSET_UINTS; ++j)
3543                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3544               else
3545 #endif
3546                 for (j = 0; j < BITSET_UINTS; ++j)
3547                   any_set |= (accepts[j] &= dfa->word_char[j]);
3548               if (!any_set)
3549                 continue;
3550             }
3551           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3552             {
3553               unsigned int any_set = 0;
3554               if (type == CHARACTER && node->word_char)
3555                 {
3556                   bitset_empty (accepts);
3557                   continue;
3558                 }
3559 #ifdef RE_ENABLE_I18N
3560               if (dfa->mb_cur_max > 1)
3561                 for (j = 0; j < BITSET_UINTS; ++j)
3562                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3563               else
3564 #endif
3565                 for (j = 0; j < BITSET_UINTS; ++j)
3566                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3567               if (!any_set)
3568                 continue;
3569             }
3570         }
3571
3572       /* Then divide `accepts' into DFA states, or create a new
3573          state.  Above, we make sure that accepts is not empty.  */
3574       for (j = 0; j < ndests; ++j)
3575         {
3576           bitset intersec; /* Intersection sets, see below.  */
3577           bitset remains;
3578           /* Flags, see below.  */
3579           int has_intersec, not_subset, not_consumed;
3580
3581           /* Optimization, skip if this state doesn't accept the character.  */
3582           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3583             continue;
3584
3585           /* Enumerate the intersection set of this state and `accepts'.  */
3586           has_intersec = 0;
3587           for (k = 0; k < BITSET_UINTS; ++k)
3588             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3589           /* And skip if the intersection set is empty.  */
3590           if (!has_intersec)
3591             continue;
3592
3593           /* Then check if this state is a subset of `accepts'.  */
3594           not_subset = not_consumed = 0;
3595           for (k = 0; k < BITSET_UINTS; ++k)
3596             {
3597               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3598               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3599             }
3600
3601           /* If this state isn't a subset of `accepts', create a
3602              new group state, which has the `remains'. */
3603           if (not_subset)
3604             {
3605               bitset_copy (dests_ch[ndests], remains);
3606               bitset_copy (dests_ch[j], intersec);
3607               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3608               if (BE (err != REG_NOERROR, 0))
3609                 goto error_return;
3610               ++ndests;
3611             }
3612
3613           /* Put the position in the current group. */
3614           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3615           if (BE (result < 0, 0))
3616             goto error_return;
3617
3618           /* If all characters are consumed, go to next node. */
3619           if (!not_consumed)
3620             break;
3621         }
3622       /* Some characters remain, create a new group. */
3623       if (j == ndests)
3624         {
3625           bitset_copy (dests_ch[ndests], accepts);
3626           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3627           if (BE (err != REG_NOERROR, 0))
3628             goto error_return;
3629           ++ndests;
3630           bitset_empty (accepts);
3631         }
3632     }
3633   return ndests;
3634  error_return:
3635   for (j = 0; j < ndests; ++j)
3636     re_node_set_free (dests_node + j);
3637   return -1;
3638 }
3639
3640 #ifdef RE_ENABLE_I18N
3641 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3642    Return the number of the bytes the node accepts.
3643    STR_IDX is the current index of the input string.
3644
3645    This function handles the nodes which can accept one character, or
3646    one collating element like '.', '[a-z]', opposite to the other nodes
3647    can only accept one byte.  */
3648
3649 static int
3650 internal_function
3651 check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
3652                          const re_string_t *input, int str_idx)
3653 {
3654   const re_token_t *node = dfa->nodes + node_idx;
3655   int char_len, elem_len;
3656   int i;
3657
3658   if (BE (node->type == OP_UTF8_PERIOD, 0))
3659     {
3660       unsigned char c = re_string_byte_at (input, str_idx), d;
3661       if (BE (c < 0xc2, 1))
3662         return 0;
3663
3664       if (str_idx + 2 > input->len)
3665         return 0;
3666
3667       d = re_string_byte_at (input, str_idx + 1);
3668       if (c < 0xe0)
3669         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3670       else if (c < 0xf0)
3671         {
3672           char_len = 3;
3673           if (c == 0xe0 && d < 0xa0)
3674             return 0;
3675         }
3676       else if (c < 0xf8)
3677         {
3678           char_len = 4;
3679           if (c == 0xf0 && d < 0x90)
3680             return 0;
3681         }
3682       else if (c < 0xfc)
3683         {
3684           char_len = 5;
3685           if (c == 0xf8 && d < 0x88)
3686             return 0;
3687         }
3688       else if (c < 0xfe)
3689         {
3690           char_len = 6;
3691           if (c == 0xfc && d < 0x84)
3692             return 0;
3693         }
3694       else
3695         return 0;
3696
3697       if (str_idx + char_len > input->len)
3698         return 0;
3699
3700       for (i = 1; i < char_len; ++i)
3701         {
3702           d = re_string_byte_at (input, str_idx + i);
3703           if (d < 0x80 || d > 0xbf)
3704             return 0;
3705         }
3706       return char_len;
3707     }
3708
3709   char_len = re_string_char_size_at (input, str_idx);
3710   if (node->type == OP_PERIOD)
3711     {
3712       if (char_len <= 1)
3713         return 0;
3714       /* FIXME: I don't think this if is needed, as both '\n'
3715          and '\0' are char_len == 1.  */
3716       /* '.' accepts any one character except the following two cases.  */
3717       if ((!(dfa->syntax & REG_DOT_NEWLINE) &&
3718            re_string_byte_at (input, str_idx) == '\n') ||
3719           ((dfa->syntax & REG_DOT_NOT_NULL) &&
3720            re_string_byte_at (input, str_idx) == '\0'))
3721         return 0;
3722       return char_len;
3723     }
3724
3725   elem_len = re_string_elem_size_at (input, str_idx);
3726   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3727     return 0;
3728
3729   if (node->type == COMPLEX_BRACKET)
3730     {
3731       const re_charset_t *cset = node->opr.mbcset;
3732 # ifdef _LIBC
3733       const unsigned char *pin
3734         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3735       int j;
3736       uint32_t nrules;
3737 # endif /* _LIBC */
3738       int match_len = 0;
3739       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3740                     ? re_string_wchar_at (input, str_idx) : 0);
3741
3742       /* match with multibyte character?  */
3743       for (i = 0; i < cset->nmbchars; ++i)
3744         if (wc == cset->mbchars[i])
3745           {
3746             match_len = char_len;
3747             goto check_node_accept_bytes_match;
3748           }
3749       /* match with character_class?  */
3750       for (i = 0; i < cset->nchar_classes; ++i)
3751         {
3752           wctype_t wt = cset->char_classes[i];
3753           if (__iswctype (wc, wt))
3754             {
3755               match_len = char_len;
3756               goto check_node_accept_bytes_match;
3757             }
3758         }
3759
3760 # ifdef _LIBC
3761       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3762       if (nrules != 0)
3763         {
3764           unsigned int in_collseq = 0;
3765           const int32_t *table, *indirect;
3766           const unsigned char *weights, *extra;
3767           const char *collseqwc;
3768           int32_t idx;
3769           /* This #include defines a local function!  */
3770 #  include <locale/weight.h>
3771
3772           /* match with collating_symbol?  */
3773           if (cset->ncoll_syms)
3774             extra = (const unsigned char *)
3775               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3776           for (i = 0; i < cset->ncoll_syms; ++i)
3777             {
3778               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3779               /* Compare the length of input collating element and
3780                  the length of current collating element.  */
3781               if (*coll_sym != elem_len)
3782                 continue;
3783               /* Compare each bytes.  */
3784               for (j = 0; j < *coll_sym; j++)
3785                 if (pin[j] != coll_sym[1 + j])
3786                   break;
3787               if (j == *coll_sym)
3788                 {
3789                   /* Match if every bytes is equal.  */
3790                   match_len = j;
3791                   goto check_node_accept_bytes_match;
3792                 }
3793             }
3794
3795           if (cset->nranges)
3796             {
3797               if (elem_len <= char_len)
3798                 {
3799                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3800                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3801                 }
3802               else
3803                 in_collseq = find_collation_sequence_value (pin, elem_len);
3804             }
3805           /* match with range expression?  */
3806           for (i = 0; i < cset->nranges; ++i)
3807             if (cset->range_starts[i] <= in_collseq
3808                 && in_collseq <= cset->range_ends[i])
3809               {
3810                 match_len = elem_len;
3811                 goto check_node_accept_bytes_match;
3812               }
3813
3814           /* match with equivalence_class?  */
3815           if (cset->nequiv_classes)
3816             {
3817               const unsigned char *cp = pin;
3818               table = (const int32_t *)
3819                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3820               weights = (const unsigned char *)
3821                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3822               extra = (const unsigned char *)
3823                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3824               indirect = (const int32_t *)
3825                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3826               idx = findidx (&cp);
3827               if (idx > 0)
3828                 for (i = 0; i < cset->nequiv_classes; ++i)
3829                   {
3830                     int32_t equiv_class_idx = cset->equiv_classes[i];
3831                     size_t weight_len = weights[idx];
3832                     if (weight_len == weights[equiv_class_idx])
3833                       {
3834                         int cnt = 0;
3835                         while (cnt <= weight_len
3836                                && (weights[equiv_class_idx + 1 + cnt]
3837                                    == weights[idx + 1 + cnt]))
3838                           ++cnt;
3839                         if (cnt > weight_len)
3840                           {
3841                             match_len = elem_len;
3842                             goto check_node_accept_bytes_match;
3843                           }
3844                       }
3845                   }
3846             }
3847         }
3848       else
3849 # endif /* _LIBC */
3850         {
3851           /* match with range expression?  */
3852 #if __GNUC__ >= 2
3853           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3854 #else
3855           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3856           cmp_buf[2] = wc;
3857 #endif
3858           for (i = 0; i < cset->nranges; ++i)
3859             {
3860               cmp_buf[0] = cset->range_starts[i];
3861               cmp_buf[4] = cset->range_ends[i];
3862               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3863                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3864                 {
3865                   match_len = char_len;
3866                   goto check_node_accept_bytes_match;
3867                 }
3868             }
3869         }
3870     check_node_accept_bytes_match:
3871       if (!cset->non_match)
3872         return match_len;
3873       else
3874         {
3875           if (match_len > 0)
3876             return 0;
3877           else
3878             return (elem_len > char_len) ? elem_len : char_len;
3879         }
3880     }
3881   return 0;
3882 }
3883
3884 # ifdef _LIBC
3885 static unsigned int
3886 find_collation_sequence_value (mbs, mbs_len)
3887     const unsigned char *mbs;
3888     size_t mbs_len;
3889 {
3890   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3891   if (nrules == 0)
3892     {
3893       if (mbs_len == 1)
3894         {
3895           /* No valid character.  Match it as a single byte character.  */
3896           const unsigned char *collseq = (const unsigned char *)
3897             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3898           return collseq[mbs[0]];
3899         }
3900       return UINT_MAX;
3901     }
3902   else
3903     {
3904       int32_t idx;
3905       const unsigned char *extra = (const unsigned char *)
3906         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3907       int32_t extrasize = (const unsigned char *)
3908         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3909
3910       for (idx = 0; idx < extrasize;)
3911         {
3912           int mbs_cnt, found = 0;
3913           int32_t elem_mbs_len;
3914           /* Skip the name of collating element name.  */
3915           idx = idx + extra[idx] + 1;
3916           elem_mbs_len = extra[idx++];
3917           if (mbs_len == elem_mbs_len)
3918             {
3919               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3920                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3921                   break;
3922               if (mbs_cnt == elem_mbs_len)
3923                 /* Found the entry.  */
3924                 found = 1;
3925             }
3926           /* Skip the byte sequence of the collating element.  */
3927           idx += elem_mbs_len;
3928           /* Adjust for the alignment.  */
3929           idx = (idx + 3) & ~3;
3930           /* Skip the collation sequence value.  */
3931           idx += sizeof (uint32_t);
3932           /* Skip the wide char sequence of the collating element.  */
3933           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3934           /* If we found the entry, return the sequence value.  */
3935           if (found)
3936             return *(uint32_t *) (extra + idx);
3937           /* Skip the collation sequence value.  */
3938           idx += sizeof (uint32_t);
3939         }
3940       return UINT_MAX;
3941     }
3942 }
3943 # endif /* _LIBC */
3944 #endif /* RE_ENABLE_I18N */
3945
3946 /* Check whether the node accepts the byte which is IDX-th
3947    byte of the INPUT.  */
3948
3949 static int
3950 internal_function
3951 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3952                    int idx)
3953 {
3954   unsigned char ch;
3955   ch = re_string_byte_at (&mctx->input, idx);
3956   switch (node->type)
3957     {
3958     case CHARACTER:
3959       if (node->opr.c != ch)
3960         return 0;
3961       break;
3962
3963     case SIMPLE_BRACKET:
3964       if (!bitset_contain (node->opr.sbcset, ch))
3965         return 0;
3966       break;
3967
3968 #ifdef RE_ENABLE_I18N
3969     case OP_UTF8_PERIOD:
3970       if (ch >= 0x80)
3971         return 0;
3972       /* FALLTHROUGH */
3973 #endif
3974     case OP_PERIOD:
3975       if ((ch == '\n' && !(mctx->dfa->syntax & REG_DOT_NEWLINE))
3976           || (ch == '\0' && (mctx->dfa->syntax & REG_DOT_NOT_NULL)))
3977         return 0;
3978       break;
3979
3980     default:
3981       return 0;
3982     }
3983
3984   if (node->constraint)
3985     {
3986       /* The node has constraints.  Check whether the current context
3987          satisfies the constraints.  */
3988       unsigned int context = re_string_context_at (&mctx->input, idx,
3989                                                    mctx->eflags);
3990       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3991         return 0;
3992     }
3993
3994   return 1;
3995 }
3996
3997 /* Extend the buffers, if the buffers have run out.  */
3998
3999 static reg_errcode_t
4000 internal_function
4001 extend_buffers (re_match_context_t *mctx)
4002 {
4003   reg_errcode_t ret;
4004   re_string_t *pstr = &mctx->input;
4005
4006   /* Double the lengthes of the buffers.  */
4007   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4008   if (BE (ret != REG_NOERROR, 0))
4009     return ret;
4010
4011   if (mctx->state_log != NULL)
4012     {
4013       /* And double the length of state_log.  */
4014       /* XXX We have no indication of the size of this buffer.  If this
4015          allocation fail we have no indication that the state_log array
4016          does not have the right size.  */
4017       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4018                                               pstr->bufs_len + 1);
4019       if (BE (new_array == NULL, 0))
4020         return REG_ESPACE;
4021       mctx->state_log = new_array;
4022     }
4023
4024   /* Then reconstruct the buffers.  */
4025   if (pstr->icase)
4026     {
4027 #ifdef RE_ENABLE_I18N
4028       if (pstr->mb_cur_max > 1)
4029         {
4030           ret = build_wcs_upper_buffer (pstr);
4031           if (BE (ret != REG_NOERROR, 0))
4032             return ret;
4033         }
4034       else
4035 #endif /* RE_ENABLE_I18N  */
4036         build_upper_buffer (pstr);
4037     }
4038   else
4039     {
4040 #ifdef RE_ENABLE_I18N
4041       if (pstr->mb_cur_max > 1)
4042         build_wcs_buffer (pstr);
4043       else
4044 #endif /* RE_ENABLE_I18N  */
4045         {
4046           if (pstr->trans != NULL)
4047             re_string_translate_buffer (pstr);
4048         }
4049     }
4050   return REG_NOERROR;
4051 }
4052
4053 \f
4054 /* Functions for matching context.  */
4055
4056 /* Initialize MCTX.  */
4057
4058 static reg_errcode_t
4059 internal_function
4060 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4061 {
4062   mctx->eflags = eflags;
4063   mctx->match_last = -1;
4064   if (n > 0)
4065     {
4066       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4067       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4068       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4069         return REG_ESPACE;
4070     }
4071   /* Already zero-ed by the caller.
4072      else
4073        mctx->bkref_ents = NULL;
4074      mctx->nbkref_ents = 0;
4075      mctx->nsub_tops = 0;  */
4076   mctx->abkref_ents = n;
4077   mctx->max_mb_elem_len = 1;
4078   mctx->asub_tops = n;
4079   return REG_NOERROR;
4080 }
4081
4082 /* Clean the entries which depend on the current input in MCTX.
4083    This function must be invoked when the matcher changes the start index
4084    of the input, or changes the input string.  */
4085
4086 static void
4087 internal_function
4088 match_ctx_clean (re_match_context_t *mctx)
4089 {
4090   int st_idx;
4091   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4092     {
4093       int sl_idx;
4094       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4095       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4096         {
4097           re_sub_match_last_t *last = top->lasts[sl_idx];
4098           re_free (last->path.array);
4099           re_free (last);
4100         }
4101       re_free (top->lasts);
4102       if (top->path)
4103         {
4104           re_free (top->path->array);
4105           re_free (top->path);
4106         }
4107       free (top);
4108     }
4109
4110   mctx->nsub_tops = 0;
4111   mctx->nbkref_ents = 0;
4112 }
4113
4114 /* Free all the memory associated with MCTX.  */
4115
4116 static void
4117 internal_function
4118 match_ctx_free (re_match_context_t *mctx)
4119 {
4120   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4121   match_ctx_clean (mctx);
4122   re_free (mctx->sub_tops);
4123   re_free (mctx->bkref_ents);
4124 }
4125
4126 /* Add a new backreference entry to MCTX.
4127    Note that we assume that caller never call this function with duplicate
4128    entry, and call with STR_IDX which isn't smaller than any existing entry.
4129 */
4130
4131 static reg_errcode_t
4132 internal_function
4133 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx,
4134                      int from, int to)
4135 {
4136   if (mctx->nbkref_ents >= mctx->abkref_ents)
4137     {
4138       struct re_backref_cache_entry* new_entry;
4139       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4140                               mctx->abkref_ents * 2);
4141       if (BE (new_entry == NULL, 0))
4142         {
4143           re_free (mctx->bkref_ents);
4144           return REG_ESPACE;
4145         }
4146       mctx->bkref_ents = new_entry;
4147       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4148               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4149       mctx->abkref_ents *= 2;
4150     }
4151   if (mctx->nbkref_ents > 0
4152       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4153     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4154
4155   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4156   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4157   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4158   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4159
4160   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4161      If bit N is clear, means that this entry won't epsilon-transition to
4162      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4163      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4164      such node.
4165
4166      A backreference does not epsilon-transition unless it is empty, so set
4167      to all zeros if FROM != TO.  */
4168   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4169     = (from == to ? ~0 : 0);
4170
4171   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4172   if (mctx->max_mb_elem_len < to - from)
4173     mctx->max_mb_elem_len = to - from;
4174   return REG_NOERROR;
4175 }
4176
4177 /* Search for the first entry which has the same str_idx, or -1 if none is
4178    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4179
4180 static int
4181 internal_function
4182 search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
4183 {
4184   int left, right, mid, last;
4185   last = right = mctx->nbkref_ents;
4186   for (left = 0; left < right;)
4187     {
4188       mid = (left + right) / 2;
4189       if (mctx->bkref_ents[mid].str_idx < str_idx)
4190         left = mid + 1;
4191       else
4192         right = mid;
4193     }
4194   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4195     return left;
4196   else
4197     return -1;
4198 }
4199
4200 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4201    at STR_IDX.  */
4202
4203 static reg_errcode_t
4204 internal_function
4205 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4206 {
4207 #ifdef DEBUG
4208   assert (mctx->sub_tops != NULL);
4209   assert (mctx->asub_tops > 0);
4210 #endif
4211   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4212     {
4213       int new_asub_tops = mctx->asub_tops * 2;
4214       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4215                                                    re_sub_match_top_t *,
4216                                                    new_asub_tops);
4217       if (BE (new_array == NULL, 0))
4218         return REG_ESPACE;
4219       mctx->sub_tops = new_array;
4220       mctx->asub_tops = new_asub_tops;
4221     }
4222   mctx->sub_tops[mctx->nsub_tops] = re_calloc (re_sub_match_top_t, 1);
4223   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4224     return REG_ESPACE;
4225   mctx->sub_tops[mctx->nsub_tops]->node = node;
4226   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4227   return REG_NOERROR;
4228 }
4229
4230 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4231    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4232
4233 static re_sub_match_last_t *
4234 internal_function
4235 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4236 {
4237   re_sub_match_last_t *new_entry;
4238   if (BE (subtop->nlasts == subtop->alasts, 0))
4239     {
4240       int new_alasts = 2 * subtop->alasts + 1;
4241       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4242                                                     re_sub_match_last_t *,
4243                                                     new_alasts);
4244       if (BE (new_array == NULL, 0))
4245         return NULL;
4246       subtop->lasts = new_array;
4247       subtop->alasts = new_alasts;
4248     }
4249   new_entry = re_calloc (re_sub_match_last_t, 1);
4250   if (BE (new_entry != NULL, 1))
4251     {
4252       subtop->lasts[subtop->nlasts] = new_entry;
4253       new_entry->node = node;
4254       new_entry->str_idx = str_idx;
4255       ++subtop->nlasts;
4256     }
4257   return new_entry;
4258 }
4259
4260 static void
4261 internal_function
4262 sift_ctx_init (re_sift_context_t *sctx,
4263                re_dfastate_t **sifted_sts,
4264                re_dfastate_t **limited_sts,
4265                int last_node, int last_str_idx)
4266 {
4267   sctx->sifted_states = sifted_sts;
4268   sctx->limited_states = limited_sts;
4269   sctx->last_node = last_node;
4270   sctx->last_str_idx = last_str_idx;
4271   re_node_set_init_empty (&sctx->limits);
4272 }