lib/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      int n) internal_function;
  22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  23 static void match_ctx_free (re_match_context_t *cache) internal_function;
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  25                                           int str_idx, int from, int to)
  26      internal_function;
  27 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  28      internal_function;
  29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  30                                            int str_idx) internal_function;
  31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  32                                                    int node, int str_idx)
  33      internal_function;
  34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  35                            re_dfastate_t **limited_sts, int last_node,
  36                            int last_str_idx)
  37      internal_function;
  38 static reg_errcode_t re_search_internal (const regex_t *preg,
  39                                          const char *string, int length,
  40                                          int start, int range, int stop,
  41                                          size_t nmatch, regmatch_t pmatch[],
  42                                          int eflags) internal_function;
  43 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  44                              const char *string1, int length1,
  45                              const char *string2, int length2,
  46                              int start, int range, struct re_registers *regs,
  47                              int stop, int ret_len) internal_function;
  48 static int re_search_stub (struct re_pattern_buffer *bufp,
  49                            const char *string, int length, int start,
  50                            int range, int stop, struct re_registers *regs,
  51                            int ret_len) internal_function;
  52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  53                               int nregs, int regs_allocated) internal_function;
  54 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  55      internal_function;
  56 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  57                            int *p_match_first)
  58      internal_function;
  59 static int check_halt_state_context (const re_match_context_t *mctx,
  60                                      const re_dfastate_t *state, int idx)
  61      internal_function;
  62 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  63                          regmatch_t *prev_idx_match, int cur_node,
  64                          int cur_idx, int nmatch) internal_function;
  65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                       int str_idx, int dest_node, int nregs,
  67                                       regmatch_t *regs,
  68                                       re_node_set *eps_via_nodes) internal_function;
  69 static reg_errcode_t set_regs (const regex_t *preg,
  70                                const re_match_context_t *mctx,
  71                                size_t nmatch, regmatch_t *pmatch,
  72                                int fl_backtrack) internal_function;
  73 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  74
  75 #ifdef RE_ENABLE_I18N
  76 static int sift_states_iter_mb (const re_match_context_t *mctx,
  77                                 re_sift_context_t *sctx,
  78                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  79 #endif /* RE_ENABLE_I18N */
  80 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  81                                            re_sift_context_t *sctx) internal_function;
  82 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  83                                           re_sift_context_t *sctx, int str_idx,
  84                                           re_node_set *cur_dest) internal_function;
  85 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  86                                               re_sift_context_t *sctx,
  87                                               int str_idx,
  88                                               re_node_set *dest_nodes) internal_function;
  89 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
  90                                             re_node_set *dest_nodes,
  91                                             const re_node_set *candidates) internal_function;
  92 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
  93                              int dst_node, int dst_idx, int src_node,
  94                              int src_idx) internal_function;
  95 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
  96                                         int boundaries, int subexp_idx,
  97                                         int from_node, int bkref_idx) internal_function;
  98 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
  99                                       int limit, int subexp_idx,
 100                                       int node, int str_idx,
 101                                       int bkref_idx) internal_function;
 102 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 103                                           re_node_set *dest_nodes,
 104                                           const re_node_set *candidates,
 105                                           re_node_set *limits,
 106                                           struct re_backref_cache_entry *bkref_ents,
 107                                           int str_idx) internal_function;
 108 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 109                                         re_sift_context_t *sctx,
 110                                         int str_idx, const re_node_set *candidates) internal_function;
 111 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 112                                         re_dfastate_t **src, int num) internal_function;
 113 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 114                                          re_match_context_t *mctx) internal_function;
 115 static re_dfastate_t *transit_state (reg_errcode_t *err,
 116                                      re_match_context_t *mctx,
 117                                      re_dfastate_t *state) internal_function;
 118 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 119                                             re_match_context_t *mctx,
 120                                             re_dfastate_t *next_state) internal_function;
 121 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 122                                                 re_node_set *cur_nodes,
 123                                                 int str_idx) internal_function;
 124 #if 0
 125 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 126                                         re_match_context_t *mctx,
 127                                         re_dfastate_t *pstate) internal_function;
 128 #endif
 129 #ifdef RE_ENABLE_I18N
 130 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 131                                        re_dfastate_t *pstate) internal_function;
 132 #endif /* RE_ENABLE_I18N */
 133 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 134                                           const re_node_set *nodes) internal_function;
 135 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 136                                  int bkref_node, int bkref_str_idx) internal_function;
 137 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 138                                      const re_sub_match_top_t *sub_top,
 139                                      re_sub_match_last_t *sub_last,
 140                                      int bkref_node, int bkref_str) internal_function;
 141 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 142                              int subexp_idx, int type) internal_function;
 143 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 144                                     state_array_t *path, int top_node,
 145                                     int top_str, int last_node, int last_str,
 146                                     int type) internal_function;
 147 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 148                                                    int str_idx,
 149                                                    re_node_set *cur_nodes,
 150                                                    re_node_set *next_nodes) internal_function;
 151 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 152                                                re_node_set *cur_nodes,
 153                                                int ex_subexp, int type) internal_function;
 154 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 155                                                    re_node_set *dst_nodes,
 156                                                    int target, int ex_subexp,
 157                                                    int type) internal_function;
 158 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 159                                          re_node_set *cur_nodes, int cur_str,
 160                                          int subexp_num, int type) internal_function;
 161 static int build_trtable (re_dfa_t *dfa,
 162                           re_dfastate_t *state) internal_function;
 163 #ifdef RE_ENABLE_I18N
 164 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 165                                     const re_string_t *input, int idx) internal_function;
 166 # ifdef _LIBC
 167 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 168                                                    size_t name_len) internal_function;
 169 # endif /* _LIBC */
 170 #endif /* RE_ENABLE_I18N */
 171 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 172                                        const re_dfastate_t *state,
 173                                        re_node_set *states_node,
 174                                        bitset *states_ch) internal_function;
 175 static int check_node_accept (const re_match_context_t *mctx,
 176                               const re_token_t *node, int idx) internal_function;
 177 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 178 \f
 179 /* Entry point for POSIX code.  */
 180
 181 /* regexec searches for a given pattern, specified by PREG, in the
 182    string STRING.
 183
 184    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 185    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 186    least NMATCH elements, and we set them to the offsets of the
 187    corresponding matched substrings.
 188
 189    EFLAGS specifies `execution flags' which affect matching: if
 190    REG_NOTBOL is set, then ^ does not match at the beginning of the
 191    string; if REG_NOTEOL is set, then $ does not match at the end.
 192
 193    We return 0 if we find a match and REG_NOMATCH if not.  */
 194
 195 int
 196 regexec (const regex_t *__restrict preg, const char *__restrict string,
 197          size_t nmatch, regmatch_t pmatch[], int eflags)
 198 {
 199   reg_errcode_t err;
 200   int start, length;
 201 #ifdef _LIBC
 202   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 203 #endif
 204
 205   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 206     return REG_BADPAT;
 207
 208   if (eflags & REG_STARTEND)
 209     {
 210       start = pmatch[0].rm_so;
 211       length = pmatch[0].rm_eo;
 212     }
 213   else
 214     {
 215       start = 0;
 216       length = strlen (string);
 217     }
 218
 219   __libc_lock_lock (dfa->lock);
 220   if (preg->re_no_sub)
 221     err = re_search_internal (preg, string, length, start, length - start,
 222                               length, 0, NULL, eflags);
 223   else
 224     err = re_search_internal (preg, string, length, start, length - start,
 225                               length, nmatch, pmatch, eflags);
 226   __libc_lock_unlock (dfa->lock);
 227   return err != REG_NOERROR;
 228 }
 229
 230 #ifdef _LIBC
 231 # include <shlib-compat.h>
 232 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 233
 234 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 235 __typeof__ (__regexec) __compat_regexec;
 236
 237 int
 238 attribute_compat_text_section
 239 __compat_regexec (const regex_t *__restrict preg,
 240                   const char *__restrict string, size_t nmatch,
 241                   regmatch_t pmatch[], int eflags)
 242 {
 243   return regexec (preg, string, nmatch, pmatch,
 244                   eflags & (REG_NOTBOL | REG_NOTEOL));
 245 }
 246 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 247 # endif
 248 #endif
 249
 250 /* Entry points for GNU code.  */
 251
 252 /* re_match, re_search, re_match_2, re_search_2
 253
 254    The former two functions operate on STRING with length LENGTH,
 255    while the later two operate on concatenation of STRING1 and STRING2
 256    with lengths LENGTH1 and LENGTH2, respectively.
 257
 258    re_match() matches the compiled pattern in BUFP against the string,
 259    starting at index START.
 260
 261    re_search() first tries matching at index START, then it tries to match
 262    starting from index START + 1, and so on.  The last start position tried
 263    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 264    way as re_match().)
 265
 266    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 267    the first STOP characters of the concatenation of the strings should be
 268    concerned.
 269
 270    If REGS is not NULL, and BUFP->re_no_sub is not set, the offsets of the match
 271    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 272    computed relative to the concatenation, not relative to the individual
 273    strings.)
 274
 275    On success, re_match* functions return the length of the match, re_search*
 276    return the position of the start of the match.  Return value -1 means no
 277    match was found and -2 indicates an internal error.  */
 278
 279 int
 280 re_match (struct re_pattern_buffer *bufp, const char *string,
 281           int length, int start, struct re_registers *regs)
 282 {
 283   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 284 }
 285 #ifdef _LIBC
 286 weak_alias (__re_match, re_match)
 287 #endif
 288
 289 int
 290 re_search (struct re_pattern_buffer *bufp, const char *string,
 291            int length, int start, int range, struct re_registers *regs)
 292 {
 293   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 294 }
 295 #ifdef _LIBC
 296 weak_alias (__re_search, re_search)
 297 #endif
 298
 299 int
 300 re_match_2 (struct re_pattern_buffer *bufp,
 301             const char *string1, int length1,
 302             const char *string2, int length2,
 303             int start, struct re_registers *regs, int stop)
 304 {
 305   return re_search_2_stub (bufp, string1, length1, string2, length2,
 306                            start, 0, regs, stop, 1);
 307 }
 308 #ifdef _LIBC
 309 weak_alias (__re_match_2, re_match_2)
 310 #endif
 311
 312 int
 313 re_search_2 (struct re_pattern_buffer *bufp,
 314              const char *string1, int length1,
 315              const char *string2, int length2,
 316              int start, int range, struct re_registers *regs, int stop)
 317 {
 318   return re_search_2_stub (bufp, string1, length1, string2, length2,
 319                            start, range, regs, stop, 0);
 320 }
 321 #ifdef _LIBC
 322 weak_alias (__re_search_2, re_search_2)
 323 #endif
 324
 325 static int
 326 internal_function
 327 re_search_2_stub (struct re_pattern_buffer *bufp,
 328                   const char *string1, int length1,
 329                   const char *string2, int length2,
 330                   int start, int range, struct re_registers *regs, int stop,
 331                   int ret_len)
 332 {
 333   const char *str;
 334   int rval;
 335   int len = length1 + length2;
 336   int free_str = 0;
 337
 338   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 339     return -2;
 340
 341   /* Concatenate the strings.  */
 342   if (length2 > 0)
 343     if (length1 > 0)
 344       {
 345         char *s = re_malloc (char, len);
 346
 347         if (BE (s == NULL, 0))
 348           return -2;
 349         memcpy (s, string1, length1);
 350         memcpy (s + length1, string2, length2);
 351         str = s;
 352         free_str = 1;
 353       }
 354     else
 355       str = string2;
 356   else
 357     str = string1;
 358
 359   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 360                          ret_len);
 361   if (free_str)
 362     re_free ((char *) str);
 363   return rval;
 364 }
 365
 366 /* The parameters have the same meaning as those of re_search.
 367    Additional parameters:
 368    If RET_LEN is nonzero the length of the match is returned (re_match style);
 369    otherwise the position of the match is returned.  */
 370
 371 static int
 372 internal_function
 373 re_search_stub (struct re_pattern_buffer *bufp,
 374                 const char *string, int length,
 375                 int start, int range, int stop, struct re_registers *regs,
 376                 int ret_len)
 377 {
 378   reg_errcode_t result;
 379   regmatch_t *pmatch;
 380   int nregs, rval;
 381   int eflags = 0;
 382 #ifdef _LIBC
 383   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 384 #endif
 385
 386   /* Check for out-of-range.  */
 387   if (BE (start < 0 || start > length, 0))
 388     return -1;
 389   if (BE (start + range > length, 0))
 390     range = length - start;
 391   else if (BE (start + range < 0, 0))
 392     range = -start;
 393
 394   __libc_lock_lock (dfa->lock);
 395
 396   eflags |= (bufp->re_not_bol) ? REG_NOTBOL : 0;
 397   eflags |= (bufp->re_not_eol) ? REG_NOTEOL : 0;
 398
 399   /* Compile fastmap if we haven't yet.  */
 400   if (range > 0 && bufp->re_fastmap != NULL && !bufp->re_fastmap_accurate)
 401     re_compile_fastmap (bufp);
 402
 403   if (BE (bufp->re_no_sub, 0))
 404     regs = NULL;
 405
 406   /* We need at least 1 register.  */
 407   if (regs == NULL)
 408     nregs = 1;
 409   else if (BE (bufp->re_regs_allocated == REG_FIXED
 410                && regs->rm_num_regs < bufp->re_nsub + 1, 0))
 411     {
 412       nregs = regs->rm_num_regs;
 413       if (BE (nregs < 1, 0))
 414         {
 415           /* Nothing can be copied to regs.  */
 416           regs = NULL;
 417           nregs = 1;
 418         }
 419     }
 420   else
 421     nregs = bufp->re_nsub + 1;
 422   pmatch = re_malloc (regmatch_t, nregs);
 423   if (BE (pmatch == NULL, 0))
 424     {
 425       rval = -2;
 426       goto out;
 427     }
 428
 429   result = re_search_internal (bufp, string, length, start, range, stop,
 430                                nregs, pmatch, eflags);
 431
 432   rval = 0;
 433
 434   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 435   if (result != REG_NOERROR)
 436     rval = -1;
 437   else if (regs != NULL)
 438     {
 439       /* If caller wants register contents data back, copy them.  */
 440       bufp->re_regs_allocated = re_copy_regs (regs, pmatch, nregs,
 441                                               bufp->re_regs_allocated);
 442       if (BE (bufp->re_regs_allocated == REG_UNALLOCATED, 0))
 443         rval = -2;
 444     }
 445
 446   if (BE (rval == 0, 1))
 447     {
 448       if (ret_len)
 449         {
 450           assert (pmatch[0].rm_so == start);
 451           rval = pmatch[0].rm_eo - start;
 452         }
 453       else
 454         rval = pmatch[0].rm_so;
 455     }
 456   re_free (pmatch);
 457  out:
 458   __libc_lock_unlock (dfa->lock);
 459   return rval;
 460 }
 461
 462 static unsigned
 463 internal_function
 464 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs,
 465               int regs_allocated)
 466 {
 467   int rval = REG_REALLOCATE;
 468   int i;
 469   int need_regs = nregs + 1;
 470   /* We need one extra element beyond `rm_num_regs' for the `-1' marker GNU code
 471      uses.  */
 472
 473   /* Have the register data arrays been allocated?  */
 474   if (regs_allocated == REG_UNALLOCATED)
 475     { /* No.  So allocate them with malloc.  */
 476       regs->rm_start = re_malloc (regoff_t, need_regs);
 477       regs->rm_end = re_malloc (regoff_t, need_regs);
 478       if (BE (regs->rm_start == NULL, 0) || BE (regs->rm_end == NULL, 0))
 479         return REG_UNALLOCATED;
 480       regs->rm_num_regs = need_regs;
 481     }
 482   else if (regs_allocated == REG_REALLOCATE)
 483     { /* Yes.  If we need more elements than were already
 484          allocated, reallocate them.  If we need fewer, just
 485          leave it alone.  */
 486       if (BE (need_regs > regs->rm_num_regs, 0))
 487         {
 488           regoff_t *new_start =
 489             re_realloc (regs->rm_start, regoff_t, need_regs);
 490           regoff_t *new_end = re_realloc (regs->rm_end, regoff_t, need_regs);
 491           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 492             return REG_UNALLOCATED;
 493           regs->rm_start = new_start;
 494           regs->rm_end = new_end;
 495           regs->rm_num_regs = need_regs;
 496         }
 497     }
 498   else
 499     {
 500       assert (regs_allocated == REG_FIXED);
 501       /* This function may not be called with REG_FIXED and nregs too big.  */
 502       assert (regs->rm_num_regs >= nregs);
 503       rval = REG_FIXED;
 504     }
 505
 506   /* Copy the regs.  */
 507   for (i = 0; i < nregs; ++i)
 508     {
 509       regs->rm_start[i] = pmatch[i].rm_so;
 510       regs->rm_end[i] = pmatch[i].rm_eo;
 511     }
 512   for ( ; i < regs->rm_num_regs; ++i)
 513     regs->rm_start[i] = regs->rm_end[i] = -1;
 514
 515   return rval;
 516 }
 517
 518 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 519    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 520    this memory for recording register information.  STARTS and ENDS
 521    must be allocated using the malloc library routine, and must each
 522    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 523
 524    If NUM_REGS == 0, then subsequent matches should allocate their own
 525    register data.
 526
 527    Unless this function is called, the first search or match using
 528    PATTERN_BUFFER will allocate its own register data, without
 529    freeing the old data.  */
 530
 531 void
 532 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
 533                   unsigned int num_regs, regoff_t *starts, regoff_t *ends)
 534 {
 535   if (num_regs)
 536     {
 537       bufp->re_regs_allocated = REG_REALLOCATE;
 538       regs->rm_num_regs = num_regs;
 539       regs->rm_start = starts;
 540       regs->rm_end = ends;
 541     }
 542   else
 543     {
 544       bufp->re_regs_allocated = REG_UNALLOCATED;
 545       regs->rm_num_regs = 0;
 546       regs->rm_start = regs->rm_end = NULL;
 547     }
 548 }
 549 #ifdef _LIBC
 550 weak_alias (__re_set_registers, re_set_registers)
 551 #endif
 552 \f
 553 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 554    them unless specifically requested.  */
 555
 556 #if defined _REGEX_RE_COMP || defined _LIBC
 557 int
 558 # ifdef _LIBC
 559 weak_function
 560 # endif
 561 re_exec (s)
 562      const char *s;
 563 {
 564   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 565 }
 566 #endif /* _REGEX_RE_COMP */
 567 \f
 568 /* Internal entry point.  */
 569
 570 /* Searches for a compiled pattern PREG in the string STRING, whose
 571    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 572    mingings with regexec.  START, and RANGE have the same meanings
 573    with re_search.
 574    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 575    otherwise return the error code.
 576    Note: We assume front end functions already check ranges.
 577    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 578
 579 static reg_errcode_t
 580 internal_function
 581 re_search_internal (const regex_t *preg,
 582                     const char *string, int length,
 583                     int start, int range, int stop,
 584                     size_t nmatch, regmatch_t pmatch[],
 585                     int eflags)
 586 {
 587   reg_errcode_t err;
 588   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 589   int left_lim, right_lim, incr;
 590   int fl_longest_match, match_first, match_kind, match_last = -1;
 591   int extra_nmatch;
 592   int sb, ch;
 593 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 594   re_match_context_t mctx = { .dfa = dfa };
 595 #else
 596   re_match_context_t mctx;
 597 #endif
 598   char *fastmap = (preg->re_fastmap != NULL && preg->re_fastmap_accurate
 599                    && range && !preg->re_can_be_null) ? preg->re_fastmap : NULL;
 600   unsigned REG_TRANSLATE_TYPE t =
 601     (unsigned REG_TRANSLATE_TYPE) preg->re_translate;
 602
 603 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 604   memset (&mctx, '\0', sizeof (re_match_context_t));
 605   mctx.dfa = dfa;
 606 #endif
 607
 608   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 609   nmatch -= extra_nmatch;
 610
 611   /* Check if the DFA haven't been compiled.  */
 612   if (BE (preg->re_used == 0 || dfa->init_state == NULL
 613           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 614           || dfa->init_state_begbuf == NULL, 0))
 615     return REG_NOMATCH;
 616
 617 #ifdef DEBUG
 618   /* We assume front-end functions already check them.  */
 619   assert (start + range >= 0 && start + range <= length);
 620 #endif
 621
 622   /* If initial states with non-begbuf contexts have no elements,
 623      the regex must be anchored.  If preg->re_newline_anchor is set,
 624      we'll never use init_state_nl, so do not check it.  */
 625   if (dfa->init_state->nodes.nelem == 0
 626       && dfa->init_state_word->nodes.nelem == 0
 627       && (dfa->init_state_nl->nodes.nelem == 0
 628           || !preg->re_newline_anchor))
 629     {
 630       if (start != 0 && start + range != 0)
 631         return REG_NOMATCH;
 632       start = range = 0;
 633     }
 634
 635   /* We must check the longest matching, if nmatch > 0.  */
 636   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 637
 638   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 639                             preg->re_translate,
 640                             preg->re_syntax & REG_IGNORE_CASE, dfa);
 641   if (BE (err != REG_NOERROR, 0))
 642     goto free_return;
 643   mctx.input.stop = stop;
 644   mctx.input.raw_stop = stop;
 645   mctx.input.newline_anchor = preg->re_newline_anchor;
 646
 647   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 648   if (BE (err != REG_NOERROR, 0))
 649     goto free_return;
 650
 651   /* We will log all the DFA states through which the dfa pass,
 652      if nmatch > 1, or this dfa has "multibyte node", which is a
 653      back-reference or a node which can accept multibyte character or
 654      multi character collating element.  */
 655   if (nmatch > 1 || dfa->has_mb_node)
 656     {
 657       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 658       if (BE (mctx.state_log == NULL, 0))
 659         {
 660           err = REG_ESPACE;
 661           goto free_return;
 662         }
 663     }
 664   else
 665     mctx.state_log = NULL;
 666
 667   match_first = start;
 668   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 669                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 670
 671   /* Check incrementally whether of not the input string match.  */
 672   incr = (range < 0) ? -1 : 1;
 673   left_lim = (range < 0) ? start + range : start;
 674   right_lim = (range < 0) ? start : start + range;
 675   sb = dfa->mb_cur_max == 1;
 676   match_kind =
 677     (fastmap
 678      ? ((sb || !(preg->re_syntax & REG_IGNORE_CASE || t) ? 4 : 0)
 679         | (range >= 0 ? 2 : 0)
 680         | (t != NULL ? 1 : 0))
 681      : 8);
 682
 683   for (;; match_first += incr)
 684     {
 685       err = REG_NOMATCH;
 686       if (match_first < left_lim || right_lim < match_first)
 687         goto free_return;
 688
 689       /* Advance as rapidly as possible through the string, until we
 690          find a plausible place to start matching.  This may be done
 691          with varying efficiency, so there are various possibilities:
 692          only the most common of them are specialized, in order to
 693          save on code size.  We use a switch statement for speed.  */
 694       switch (match_kind)
 695         {
 696         case 8:
 697           /* No fastmap.  */
 698           break;
 699
 700         case 7:
 701           /* Fastmap with single-byte translation, match forward.  */
 702           while (BE (match_first < right_lim, 1)
 703                  && !fastmap[t[(unsigned char) string[match_first]]])
 704             ++match_first;
 705           goto forward_match_found_start_or_reached_end;
 706
 707         case 6:
 708           /* Fastmap without translation, match forward.  */
 709           while (BE (match_first < right_lim, 1)
 710                  && !fastmap[(unsigned char) string[match_first]])
 711             ++match_first;
 712
 713         forward_match_found_start_or_reached_end:
 714           if (BE (match_first == right_lim, 0))
 715             {
 716               ch = match_first >= length
 717                        ? 0 : (unsigned char) string[match_first];
 718               if (!fastmap[t ? t[ch] : ch])
 719                 goto free_return;
 720             }
 721           break;
 722
 723         case 4:
 724         case 5:
 725           /* Fastmap without multi-byte translation, match backwards.  */
 726           while (match_first >= left_lim)
 727             {
 728               ch = match_first >= length
 729                        ? 0 : (unsigned char) string[match_first];
 730               if (fastmap[t ? t[ch] : ch])
 731                 break;
 732               --match_first;
 733             }
 734           if (match_first < left_lim)
 735             goto free_return;
 736           break;
 737
 738         default:
 739           /* In this case, we can't determine easily the current byte,
 740              since it might be a component byte of a multibyte
 741              character.  Then we use the constructed buffer instead.  */
 742           for (;;)
 743             {
 744               /* If MATCH_FIRST is out of the valid range, reconstruct the
 745                  buffers.  */
 746               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 747               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 748                 {
 749                   err = re_string_reconstruct (&mctx.input, match_first,
 750                                                eflags);
 751                   if (BE (err != REG_NOERROR, 0))
 752                     goto free_return;
 753
 754                   offset = match_first - mctx.input.raw_mbs_idx;
 755                 }
 756               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 757                  Note that MATCH_FIRST must not be smaller than 0.  */
 758               ch = (match_first >= length
 759                     ? 0 : re_string_byte_at (&mctx.input, offset));
 760               if (fastmap[ch])
 761                 break;
 762               match_first += incr;
 763               if (match_first < left_lim || match_first > right_lim)
 764                 {
 765                   err = REG_NOMATCH;
 766                   goto free_return;
 767                 }
 768             }
 769           break;
 770         }
 771
 772       /* Reconstruct the buffers so that the matcher can assume that
 773          the matching starts from the beginning of the buffer.  */
 774       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 775       if (BE (err != REG_NOERROR, 0))
 776         goto free_return;
 777
 778 #ifdef RE_ENABLE_I18N
 779      /* Don't consider this char as a possible match start if it part,
 780         yet isn't the head, of a multibyte character.  */
 781       if (!sb && !re_string_first_byte (&mctx.input, 0))
 782         continue;
 783 #endif
 784
 785       /* It seems to be appropriate one, then use the matcher.  */
 786       /* We assume that the matching starts from 0.  */
 787       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 788       match_last = check_matching (&mctx, fl_longest_match,
 789                                    range >= 0 ? &match_first : NULL);
 790       if (match_last != -1)
 791         {
 792           if (BE (match_last == -2, 0))
 793             {
 794               err = REG_ESPACE;
 795               goto free_return;
 796             }
 797           else
 798             {
 799               mctx.match_last = match_last;
 800               if ((!preg->re_no_sub && nmatch > 1) || dfa->nbackref)
 801                 {
 802                   re_dfastate_t *pstate = mctx.state_log[match_last];
 803                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 804                                                              match_last);
 805                 }
 806               if ((!preg->re_no_sub && nmatch > 1 && dfa->has_plural_match)
 807                   || dfa->nbackref)
 808                 {
 809                   err = prune_impossible_nodes (&mctx);
 810                   if (err == REG_NOERROR)
 811                     break;
 812                   if (BE (err != REG_NOMATCH, 0))
 813                     goto free_return;
 814                   match_last = -1;
 815                 }
 816               else
 817                 break; /* We found a match.  */
 818             }
 819         }
 820
 821       match_ctx_clean (&mctx);
 822     }
 823
 824 #ifdef DEBUG
 825   assert (match_last != -1);
 826   assert (err == REG_NOERROR);
 827 #endif
 828
 829   /* Set pmatch[] if we need.  */
 830   if (nmatch > 0)
 831     {
 832       int reg_idx;
 833
 834       /* Initialize registers.  */
 835       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 836         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 837
 838       /* Set the points where matching start/end.  */
 839       pmatch[0].rm_so = 0;
 840       pmatch[0].rm_eo = mctx.match_last;
 841
 842       if (!preg->re_no_sub && nmatch > 1)
 843         {
 844           err = set_regs (preg, &mctx, nmatch, pmatch,
 845                           dfa->has_plural_match && dfa->nbackref > 0);
 846           if (BE (err != REG_NOERROR, 0))
 847             goto free_return;
 848         }
 849
 850       /* At last, add the offset to the each registers, since we slided
 851          the buffers so that we could assume that the matching starts
 852          from 0.  */
 853       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 854         if (pmatch[reg_idx].rm_so != -1)
 855           {
 856 #ifdef RE_ENABLE_I18N
 857             if (BE (mctx.input.offsets_needed != 0, 0))
 858               {
 859                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 860                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 861                 else
 862                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 863                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 864                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 865                 else
 866                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 867               }
 868 #else
 869             assert (mctx.input.offsets_needed == 0);
 870 #endif
 871             pmatch[reg_idx].rm_so += match_first;
 872             pmatch[reg_idx].rm_eo += match_first;
 873           }
 874       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 875         {
 876           pmatch[nmatch + reg_idx].rm_so = -1;
 877           pmatch[nmatch + reg_idx].rm_eo = -1;
 878         }
 879
 880       if (dfa->subexp_map)
 881         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 882           if (dfa->subexp_map[reg_idx] != reg_idx)
 883             {
 884               pmatch[reg_idx + 1].rm_so
 885                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 886               pmatch[reg_idx + 1].rm_eo
 887                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 888             }
 889     }
 890
 891  free_return:
 892   re_free (mctx.state_log);
 893   if (dfa->nbackref)
 894     match_ctx_free (&mctx);
 895   re_string_destruct (&mctx.input);
 896   return err;
 897 }
 898
 899 static reg_errcode_t
 900 internal_function
 901 prune_impossible_nodes (re_match_context_t *mctx)
 902 {
 903   re_dfa_t *const dfa = mctx->dfa;
 904   int halt_node, match_last;
 905   reg_errcode_t ret;
 906   re_dfastate_t **sifted_states;
 907   re_dfastate_t **lim_states = NULL;
 908   re_sift_context_t sctx;
 909 #ifdef DEBUG
 910   assert (mctx->state_log != NULL);
 911 #endif
 912   match_last = mctx->match_last;
 913   halt_node = mctx->last_node;
 914   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 915   if (BE (sifted_states == NULL, 0))
 916     {
 917       ret = REG_ESPACE;
 918       goto free_return;
 919     }
 920   if (dfa->nbackref)
 921     {
 922       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 923       if (BE (lim_states == NULL, 0))
 924         {
 925           ret = REG_ESPACE;
 926           goto free_return;
 927         }
 928       while (1)
 929         {
 930           memset (lim_states, '\0',
 931                   sizeof (re_dfastate_t *) * (match_last + 1));
 932           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 933                          match_last);
 934           ret = sift_states_backward (mctx, &sctx);
 935           re_node_set_free (&sctx.limits);
 936           if (BE (ret != REG_NOERROR, 0))
 937               goto free_return;
 938           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 939             break;
 940           do
 941             {
 942               --match_last;
 943               if (match_last < 0)
 944                 {
 945                   ret = REG_NOMATCH;
 946                   goto free_return;
 947                 }
 948             } while (mctx->state_log[match_last] == NULL
 949                      || !mctx->state_log[match_last]->halt);
 950           halt_node = check_halt_state_context (mctx,
 951                                                 mctx->state_log[match_last],
 952                                                 match_last);
 953         }
 954       ret = merge_state_array (dfa, sifted_states, lim_states,
 955                                match_last + 1);
 956       re_free (lim_states);
 957       lim_states = NULL;
 958       if (BE (ret != REG_NOERROR, 0))
 959         goto free_return;
 960     }
 961   else
 962     {
 963       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 964       ret = sift_states_backward (mctx, &sctx);
 965       re_node_set_free (&sctx.limits);
 966       if (BE (ret != REG_NOERROR, 0))
 967         goto free_return;
 968     }
 969   re_free (mctx->state_log);
 970   mctx->state_log = sifted_states;
 971   sifted_states = NULL;
 972   mctx->last_node = halt_node;
 973   mctx->match_last = match_last;
 974   ret = REG_NOERROR;
 975  free_return:
 976   re_free (sifted_states);
 977   re_free (lim_states);
 978   return ret;
 979 }
 980
 981 /* Acquire an initial state and return it.
 982    We must select appropriate initial state depending on the context,
 983    since initial states may have constraints like "\<", "^", etc..  */
 984
 985 static inline re_dfastate_t *
 986 __attribute ((always_inline)) internal_function
 987 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
 988                             int idx)
 989 {
 990   re_dfa_t *const dfa = mctx->dfa;
 991   if (dfa->init_state->has_constraint)
 992     {
 993       unsigned int context;
 994       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
 995       if (IS_WORD_CONTEXT (context))
 996         return dfa->init_state_word;
 997       else if (IS_ORDINARY_CONTEXT (context))
 998         return dfa->init_state;
 999       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1000         return dfa->init_state_begbuf;
1001       else if (IS_NEWLINE_CONTEXT (context))
1002         return dfa->init_state_nl;
1003       else if (IS_BEGBUF_CONTEXT (context))
1004         {
1005           /* It is relatively rare case, then calculate on demand.  */
1006           return re_acquire_state_context (err, dfa,
1007                                            dfa->init_state->entrance_nodes,
1008                                            context);
1009         }
1010       else
1011         /* Must not happen?  */
1012         return dfa->init_state;
1013     }
1014   else
1015     return dfa->init_state;
1016 }
1017
1018 /* Check whether the regular expression match input string INPUT or not,
1019    and return the index where the matching end, return -1 if not match,
1020    or return -2 in case of an error.
1021    FL_LONGEST_MATCH means we want the POSIX longest matching.
1022    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1023    next place where we may want to try matching.
1024    Note that the matcher assume that the maching starts from the current
1025    index of the buffer.  */
1026
1027 static int
1028 internal_function
1029 check_matching (re_match_context_t *mctx, int fl_longest_match,
1030                 int *p_match_first)
1031 {
1032   re_dfa_t *const dfa = mctx->dfa;
1033   reg_errcode_t err;
1034   int match = 0;
1035   int match_last = -1;
1036   int cur_str_idx = re_string_cur_idx (&mctx->input);
1037   re_dfastate_t *cur_state;
1038   int at_init_state = p_match_first != NULL;
1039   int next_start_idx = cur_str_idx;
1040
1041   err = REG_NOERROR;
1042   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1043   /* An initial state must not be NULL (invalid).  */
1044   if (BE (cur_state == NULL, 0))
1045     {
1046       assert (err == REG_ESPACE);
1047       return -2;
1048     }
1049
1050   if (mctx->state_log != NULL)
1051     {
1052       mctx->state_log[cur_str_idx] = cur_state;
1053
1054       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1055          later.  E.g. Processing back references.  */
1056       if (BE (dfa->nbackref, 0))
1057         {
1058           at_init_state = 0;
1059           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1060           if (BE (err != REG_NOERROR, 0))
1061             return err;
1062
1063           if (cur_state->has_backref)
1064             {
1065               err = transit_state_bkref (mctx, &cur_state->nodes);
1066               if (BE (err != REG_NOERROR, 0))
1067                 return err;
1068             }
1069         }
1070     }
1071
1072   /* If the RE accepts NULL string.  */
1073   if (BE (cur_state->halt, 0))
1074     {
1075       if (!cur_state->has_constraint
1076           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1077         {
1078           if (!fl_longest_match)
1079             return cur_str_idx;
1080           else
1081             {
1082               match_last = cur_str_idx;
1083               match = 1;
1084             }
1085         }
1086     }
1087
1088   while (!re_string_eoi (&mctx->input))
1089     {
1090       re_dfastate_t *old_state = cur_state;
1091       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1092
1093       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1094           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1095               && mctx->input.valid_len < mctx->input.len))
1096         {
1097           err = extend_buffers (mctx);
1098           if (BE (err != REG_NOERROR, 0))
1099             {
1100               assert (err == REG_ESPACE);
1101               return -2;
1102             }
1103         }
1104
1105       cur_state = transit_state (&err, mctx, cur_state);
1106       if (mctx->state_log != NULL)
1107         cur_state = merge_state_with_log (&err, mctx, cur_state);
1108
1109       if (cur_state == NULL)
1110         {
1111           /* Reached the invalid state or an error.  Try to recover a valid
1112              state using the state log, if available and if we have not
1113              already found a valid (even if not the longest) match.  */
1114           if (BE (err != REG_NOERROR, 0))
1115             return -2;
1116
1117           if (mctx->state_log == NULL
1118               || (match && !fl_longest_match)
1119               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1120             break;
1121         }
1122
1123       if (BE (at_init_state, 0))
1124         {
1125           if (old_state == cur_state)
1126             next_start_idx = next_char_idx;
1127           else
1128             at_init_state = 0;
1129         }
1130
1131       if (cur_state->halt)
1132         {
1133           /* Reached a halt state.
1134              Check the halt state can satisfy the current context.  */
1135           if (!cur_state->has_constraint
1136               || check_halt_state_context (mctx, cur_state,
1137                                            re_string_cur_idx (&mctx->input)))
1138             {
1139               /* We found an appropriate halt state.  */
1140               match_last = re_string_cur_idx (&mctx->input);
1141               match = 1;
1142
1143               /* We found a match, do not modify match_first below.  */
1144               p_match_first = NULL;
1145               if (!fl_longest_match)
1146                 break;
1147             }
1148         }
1149     }
1150
1151   if (p_match_first)
1152     *p_match_first += next_start_idx;
1153
1154   return match_last;
1155 }
1156
1157 /* Check NODE match the current context.  */
1158
1159 static int
1160 internal_function
1161 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1162 {
1163   re_token_type_t type = dfa->nodes[node].type;
1164   unsigned int constraint = dfa->nodes[node].constraint;
1165   if (type != END_OF_RE)
1166     return 0;
1167   if (!constraint)
1168     return 1;
1169   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1170     return 0;
1171   return 1;
1172 }
1173
1174 /* Check the halt state STATE match the current context.
1175    Return 0 if not match, if the node, STATE has, is a halt node and
1176    match the context, return the node.  */
1177
1178 static int
1179 internal_function
1180 check_halt_state_context (const re_match_context_t *mctx,
1181                           const re_dfastate_t *state, int idx)
1182 {
1183   int i;
1184   unsigned int context;
1185 #ifdef DEBUG
1186   assert (state->halt);
1187 #endif
1188   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1189   for (i = 0; i < state->nodes.nelem; ++i)
1190     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1191       return state->nodes.elems[i];
1192   return 0;
1193 }
1194
1195 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1196    corresponding to the DFA).
1197    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1198    of errors.  */
1199
1200 static int
1201 internal_function
1202 proceed_next_node (const re_match_context_t *mctx,
1203                    int nregs, regmatch_t *regs, int *pidx, int node,
1204                    re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1205 {
1206   re_dfa_t *const dfa = mctx->dfa;
1207   int i, err;
1208   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1209     {
1210       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1211       re_node_set *edests = &dfa->edests[node];
1212       int dest_node;
1213       err = re_node_set_insert (eps_via_nodes, node);
1214       if (BE (err < 0, 0))
1215         return -2;
1216       /* Pick up a valid destination, or return -1 if none is found.  */
1217       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1218         {
1219           int candidate = edests->elems[i];
1220           if (!re_node_set_contains (cur_nodes, candidate))
1221             continue;
1222           if (dest_node == -1)
1223             dest_node = candidate;
1224
1225           else
1226             {
1227               /* In order to avoid infinite loop like "(a*)*", return the second
1228                  epsilon-transition if the first was already considered.  */
1229               if (re_node_set_contains (eps_via_nodes, dest_node))
1230                 return candidate;
1231
1232               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1233               else if (fs != NULL
1234                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1235                                            eps_via_nodes))
1236                 return -2;
1237
1238               /* We know we are going to exit.  */
1239               break;
1240             }
1241         }
1242       return dest_node;
1243     }
1244   else
1245     {
1246       int naccepted = 0;
1247       re_token_type_t type = dfa->nodes[node].type;
1248
1249 #ifdef RE_ENABLE_I18N
1250       if (dfa->nodes[node].accept_mb)
1251         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1252       else
1253 #endif /* RE_ENABLE_I18N */
1254       if (type == OP_BACK_REF)
1255         {
1256           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1257           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1258           if (fs != NULL)
1259             {
1260               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1261                 return -1;
1262               else if (naccepted)
1263                 {
1264                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1265                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1266                               naccepted) != 0)
1267                     return -1;
1268                 }
1269             }
1270
1271           if (naccepted == 0)
1272             {
1273               int dest_node;
1274               err = re_node_set_insert (eps_via_nodes, node);
1275               if (BE (err < 0, 0))
1276                 return -2;
1277               dest_node = dfa->edests[node].elems[0];
1278               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1279                                         dest_node))
1280                 return dest_node;
1281             }
1282         }
1283
1284       if (naccepted != 0
1285           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1286         {
1287           int dest_node = dfa->nexts[node];
1288           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1289           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1290                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1291                                                dest_node)))
1292             return -1;
1293           re_node_set_empty (eps_via_nodes);
1294           return dest_node;
1295         }
1296     }
1297   return -1;
1298 }
1299
1300 static reg_errcode_t
1301 internal_function
1302 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1303                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1304 {
1305   reg_errcode_t err;
1306   int num = fs->num++;
1307   if (fs->num == fs->alloc)
1308     {
1309       struct re_fail_stack_ent_t *new_array =
1310         re_realloc (fs->stack, struct re_fail_stack_ent_t, fs->alloc * 2);
1311       if (new_array == NULL)
1312         return REG_ESPACE;
1313       fs->alloc *= 2;
1314       fs->stack = new_array;
1315     }
1316   fs->stack[num].idx = str_idx;
1317   fs->stack[num].node = dest_node;
1318   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1319   if (fs->stack[num].regs == NULL)
1320     return REG_ESPACE;
1321   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1322   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1323   return err;
1324 }
1325
1326 static int
1327 internal_function
1328 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx,
1329                 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1330 {
1331   int num = --fs->num;
1332   assert (num >= 0);
1333   *pidx = fs->stack[num].idx;
1334   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1335   re_node_set_free (eps_via_nodes);
1336   re_free (fs->stack[num].regs);
1337   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1338   return fs->stack[num].node;
1339 }
1340
1341 /* Set the positions where the subexpressions are starts/ends to registers
1342    PMATCH.
1343    Note: We assume that pmatch[0] is already set, and
1344    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1345
1346 static reg_errcode_t
1347 internal_function
1348 set_regs (const regex_t *preg, const re_match_context_t *mctx,
1349           size_t nmatch, regmatch_t *pmatch, int fl_backtrack)
1350 {
1351   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1352   int idx, cur_node;
1353   re_node_set eps_via_nodes;
1354   struct re_fail_stack_t *fs;
1355   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1356   regmatch_t *prev_idx_match;
1357
1358 #ifdef DEBUG
1359   assert (nmatch > 1);
1360   assert (mctx->state_log != NULL);
1361 #endif
1362   if (fl_backtrack)
1363     {
1364       fs = &fs_body;
1365       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1366       if (fs->stack == NULL)
1367         return REG_ESPACE;
1368     }
1369   else
1370     fs = NULL;
1371
1372   cur_node = dfa->init_node;
1373   re_node_set_init_empty (&eps_via_nodes);
1374
1375   prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * nmatch);
1376   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1377
1378   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1379     {
1380       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1381
1382       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1383         {
1384           int reg_idx;
1385           if (fs)
1386             {
1387               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1388                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1389                   break;
1390               if (reg_idx == nmatch)
1391                 {
1392                   re_node_set_free (&eps_via_nodes);
1393                   return free_fail_stack_return (fs);
1394                 }
1395               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1396                                          &eps_via_nodes);
1397             }
1398           else
1399             {
1400               re_node_set_free (&eps_via_nodes);
1401               return REG_NOERROR;
1402             }
1403         }
1404
1405       /* Proceed to next node.  */
1406       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1407                                     &eps_via_nodes, fs);
1408
1409       if (BE (cur_node < 0, 0))
1410         {
1411           if (BE (cur_node == -2, 0))
1412             {
1413               re_node_set_free (&eps_via_nodes);
1414               free_fail_stack_return (fs);
1415               return REG_ESPACE;
1416             }
1417           if (fs)
1418             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1419                                        &eps_via_nodes);
1420           else
1421             {
1422               re_node_set_free (&eps_via_nodes);
1423               return REG_NOMATCH;
1424             }
1425         }
1426     }
1427   re_node_set_free (&eps_via_nodes);
1428   return free_fail_stack_return (fs);
1429 }
1430
1431 static reg_errcode_t
1432 internal_function
1433 free_fail_stack_return (struct re_fail_stack_t *fs)
1434 {
1435   if (fs)
1436     {
1437       int fs_idx;
1438       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1439         {
1440           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1441           re_free (fs->stack[fs_idx].regs);
1442         }
1443       re_free (fs->stack);
1444     }
1445   return REG_NOERROR;
1446 }
1447
1448 static void
1449 internal_function
1450 update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1451              int cur_node, int cur_idx, int nmatch)
1452 {
1453   int type = dfa->nodes[cur_node].type;
1454   if (type == OP_OPEN_SUBEXP)
1455     {
1456       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1457
1458       /* We are at the first node of this sub expression.  */
1459       if (reg_num < nmatch)
1460         {
1461           pmatch[reg_num].rm_so = cur_idx;
1462           pmatch[reg_num].rm_eo = -1;
1463         }
1464     }
1465   else if (type == OP_CLOSE_SUBEXP)
1466     {
1467       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1468       if (reg_num < nmatch)
1469         {
1470           /* We are at the last node of this sub expression.  */
1471           if (pmatch[reg_num].rm_so < cur_idx)
1472             {
1473               pmatch[reg_num].rm_eo = cur_idx;
1474               /* This is a non-empty match or we are not inside an optional
1475                  subexpression.  Accept this right away.  */
1476               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1477             }
1478           else
1479             {
1480               if (dfa->nodes[cur_node].opt_subexp
1481                   && prev_idx_match[reg_num].rm_so != -1)
1482                 /* We transited through an empty match for an optional
1483                    subexpression, like (a?)*, and this is not the subexp's
1484                    first match.  Copy back the old content of the registers
1485                    so that matches of an inner subexpression are undone as
1486                    well, like in ((a?))*.  */
1487                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1488               else
1489                 /* We completed a subexpression, but it may be part of
1490                    an optional one, so do not update PREV_IDX_MATCH.  */
1491                 pmatch[reg_num].rm_eo = cur_idx;
1492             }
1493         }
1494     }
1495 }
1496
1497 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1498    and sift the nodes in each states according to the following rules.
1499    Updated state_log will be wrote to STATE_LOG.
1500
1501    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1502      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1503         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1504         the LAST_NODE, we throw away the node `a'.
1505      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1506         string `s' and transit to `b':
1507         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1508            away the node `a'.
1509         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1510             thrown away, we throw away the node `a'.
1511      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1512         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1513            node `a'.
1514         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1515             we throw away the node `a'.  */
1516
1517 #define STATE_NODE_CONTAINS(state,node) \
1518   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1519
1520 static reg_errcode_t
1521 internal_function
1522 sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1523 {
1524   reg_errcode_t err;
1525   int null_cnt = 0;
1526   int str_idx = sctx->last_str_idx;
1527   re_node_set cur_dest;
1528
1529 #ifdef DEBUG
1530   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1531 #endif
1532
1533   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1534      transit to the last_node and the last_node itself.  */
1535   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1536   if (BE (err != REG_NOERROR, 0))
1537     return err;
1538   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1539   if (BE (err != REG_NOERROR, 0))
1540     goto free_return;
1541
1542   /* Then check each states in the state_log.  */
1543   while (str_idx > 0)
1544     {
1545       /* Update counters.  */
1546       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1547       if (null_cnt > mctx->max_mb_elem_len)
1548         {
1549           memset (sctx->sifted_states, '\0',
1550                   sizeof (re_dfastate_t *) * str_idx);
1551           re_node_set_free (&cur_dest);
1552           return REG_NOERROR;
1553         }
1554       re_node_set_empty (&cur_dest);
1555       --str_idx;
1556
1557       if (mctx->state_log[str_idx])
1558         {
1559           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1560           if (BE (err != REG_NOERROR, 0))
1561             goto free_return;
1562         }
1563
1564       /* Add all the nodes which satisfy the following conditions:
1565          - It can epsilon transit to a node in CUR_DEST.
1566          - It is in CUR_SRC.
1567          And update state_log.  */
1568       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1569       if (BE (err != REG_NOERROR, 0))
1570         goto free_return;
1571     }
1572   err = REG_NOERROR;
1573  free_return:
1574   re_node_set_free (&cur_dest);
1575   return err;
1576 }
1577
1578 static reg_errcode_t
1579 internal_function
1580 build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1581                      int str_idx, re_node_set *cur_dest)
1582 {
1583   re_dfa_t *const dfa = mctx->dfa;
1584   re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1585   int i;
1586
1587   /* Then build the next sifted state.
1588      We build the next sifted state on `cur_dest', and update
1589      `sifted_states[str_idx]' with `cur_dest'.
1590      Note:
1591      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1592      `cur_src' points the node_set of the old `state_log[str_idx]'
1593      (with the epsilon nodes pre-filtered out).  */
1594   for (i = 0; i < cur_src->nelem; i++)
1595     {
1596       int prev_node = cur_src->elems[i];
1597       int naccepted = 0;
1598       int ret;
1599
1600 #ifdef DEBUG
1601       re_token_type_t type = dfa->nodes[prev_node].type;
1602       assert (!IS_EPSILON_NODE (type));
1603 #endif
1604 #ifdef RE_ENABLE_I18N
1605       /* If the node may accept `multi byte'.  */
1606       if (dfa->nodes[prev_node].accept_mb)
1607         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1608                                          str_idx, sctx->last_str_idx);
1609 #endif /* RE_ENABLE_I18N */
1610
1611       /* We don't check backreferences here.
1612          See update_cur_sifted_state().  */
1613       if (!naccepted
1614           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1615           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1616                                   dfa->nexts[prev_node]))
1617         naccepted = 1;
1618
1619       if (naccepted == 0)
1620         continue;
1621
1622       if (sctx->limits.nelem)
1623         {
1624           int to_idx = str_idx + naccepted;
1625           if (check_dst_limits (mctx, &sctx->limits,
1626                                 dfa->nexts[prev_node], to_idx,
1627                                 prev_node, str_idx))
1628             continue;
1629         }
1630       ret = re_node_set_insert (cur_dest, prev_node);
1631       if (BE (ret == -1, 0))
1632         return REG_ESPACE;
1633     }
1634
1635   return REG_NOERROR;
1636 }
1637
1638 /* Helper functions.  */
1639
1640 static reg_errcode_t
1641 internal_function
1642 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1643 {
1644   int top = mctx->state_log_top;
1645
1646   if (next_state_log_idx >= mctx->input.bufs_len
1647       || (next_state_log_idx >= mctx->input.valid_len
1648           && mctx->input.valid_len < mctx->input.len))
1649     {
1650       reg_errcode_t err;
1651       err = extend_buffers (mctx);
1652       if (BE (err != REG_NOERROR, 0))
1653         return err;
1654     }
1655
1656   if (top < next_state_log_idx)
1657     {
1658       memset (mctx->state_log + top + 1, '\0',
1659               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1660       mctx->state_log_top = next_state_log_idx;
1661     }
1662   return REG_NOERROR;
1663 }
1664
1665 static reg_errcode_t
1666 internal_function
1667 merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1668                    int num)
1669 {
1670   int st_idx;
1671   reg_errcode_t err;
1672   for (st_idx = 0; st_idx < num; ++st_idx)
1673     {
1674       if (dst[st_idx] == NULL)
1675         dst[st_idx] = src[st_idx];
1676       else if (src[st_idx] != NULL)
1677         {
1678           re_node_set merged_set;
1679           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1680                                         &src[st_idx]->nodes);
1681           if (BE (err != REG_NOERROR, 0))
1682             return err;
1683           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1684           re_node_set_free (&merged_set);
1685           if (BE (err != REG_NOERROR, 0))
1686             return err;
1687         }
1688     }
1689   return REG_NOERROR;
1690 }
1691
1692 static reg_errcode_t
1693 internal_function
1694 update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1695                          int str_idx, re_node_set *dest_nodes)
1696 {
1697   re_dfa_t *const dfa = mctx->dfa;
1698   reg_errcode_t err;
1699   const re_node_set *candidates;
1700   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1701                 : &mctx->state_log[str_idx]->nodes);
1702
1703   if (dest_nodes->nelem == 0)
1704     sctx->sifted_states[str_idx] = NULL;
1705   else
1706     {
1707       if (candidates)
1708         {
1709           /* At first, add the nodes which can epsilon transit to a node in
1710              DEST_NODE.  */
1711           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1712           if (BE (err != REG_NOERROR, 0))
1713             return err;
1714
1715           /* Then, check the limitations in the current sift_context.  */
1716           if (sctx->limits.nelem)
1717             {
1718               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1719                                          mctx->bkref_ents, str_idx);
1720               if (BE (err != REG_NOERROR, 0))
1721                 return err;
1722             }
1723         }
1724
1725       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1726       if (BE (err != REG_NOERROR, 0))
1727         return err;
1728     }
1729
1730   if (candidates && mctx->state_log[str_idx]->has_backref)
1731     {
1732       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1733       if (BE (err != REG_NOERROR, 0))
1734         return err;
1735     }
1736   return REG_NOERROR;
1737 }
1738
1739 static reg_errcode_t
1740 internal_function
1741 add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1742                        const re_node_set *candidates)
1743 {
1744   reg_errcode_t err = REG_NOERROR;
1745   int i;
1746
1747   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1748   if (BE (err != REG_NOERROR, 0))
1749     return err;
1750
1751   if (!state->inveclosure.alloc)
1752     {
1753       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1754       if (BE (err != REG_NOERROR, 0))
1755         return REG_ESPACE;
1756       for (i = 0; i < dest_nodes->nelem; i++)
1757         re_node_set_merge (&state->inveclosure,
1758                            dfa->inveclosures + dest_nodes->elems[i]);
1759     }
1760   return re_node_set_add_intersect (dest_nodes, candidates,
1761                                     &state->inveclosure);
1762 }
1763
1764 static reg_errcode_t
1765 internal_function
1766 sub_epsilon_src_nodes (re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1767                        const re_node_set *candidates)
1768 {
1769     int ecl_idx;
1770     reg_errcode_t err;
1771     re_node_set *inv_eclosure = dfa->inveclosures + node;
1772     re_node_set except_nodes;
1773     re_node_set_init_empty (&except_nodes);
1774     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1775       {
1776         int cur_node = inv_eclosure->elems[ecl_idx];
1777         if (cur_node == node)
1778           continue;
1779         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1780           {
1781             int edst1 = dfa->edests[cur_node].elems[0];
1782             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1783                          ? dfa->edests[cur_node].elems[1] : -1);
1784             if ((!re_node_set_contains (inv_eclosure, edst1)
1785                  && re_node_set_contains (dest_nodes, edst1))
1786                 || (edst2 > 0
1787                     && !re_node_set_contains (inv_eclosure, edst2)
1788                     && re_node_set_contains (dest_nodes, edst2)))
1789               {
1790                 err = re_node_set_add_intersect (&except_nodes, candidates,
1791                                                  dfa->inveclosures + cur_node);
1792                 if (BE (err != REG_NOERROR, 0))
1793                   {
1794                     re_node_set_free (&except_nodes);
1795                     return err;
1796                   }
1797               }
1798           }
1799       }
1800     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1801       {
1802         int cur_node = inv_eclosure->elems[ecl_idx];
1803         if (!re_node_set_contains (&except_nodes, cur_node))
1804           {
1805             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1806             re_node_set_remove_at (dest_nodes, idx);
1807           }
1808       }
1809     re_node_set_free (&except_nodes);
1810     return REG_NOERROR;
1811 }
1812
1813 static int
1814 internal_function
1815 check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
1816                   int dst_node, int dst_idx, int src_node, int src_idx)
1817 {
1818   re_dfa_t *const dfa = mctx->dfa;
1819   int lim_idx, src_pos, dst_pos;
1820
1821   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1822   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1823   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1824     {
1825       int subexp_idx;
1826       struct re_backref_cache_entry *ent;
1827       ent = mctx->bkref_ents + limits->elems[lim_idx];
1828       subexp_idx = dfa->nodes[ent->node].opr.idx;
1829
1830       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1831                                            subexp_idx, dst_node, dst_idx,
1832                                            dst_bkref_idx);
1833       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1834                                            subexp_idx, src_node, src_idx,
1835                                            src_bkref_idx);
1836
1837       /* In case of:
1838          <src> <dst> ( <subexp> )
1839          ( <subexp> ) <src> <dst>
1840          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1841       if (src_pos == dst_pos)
1842         continue; /* This is unrelated limitation.  */
1843       else
1844         return 1;
1845     }
1846   return 0;
1847 }
1848
1849 static int
1850 internal_function
1851 check_dst_limits_calc_pos_1 (re_match_context_t *mctx, int boundaries,
1852                              int subexp_idx, int from_node, int bkref_idx)
1853 {
1854   re_dfa_t *const dfa = mctx->dfa;
1855   re_node_set *eclosures = dfa->eclosures + from_node;
1856   int node_idx;
1857
1858   /* Else, we are on the boundary: examine the nodes on the epsilon
1859      closure.  */
1860   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1861     {
1862       int node = eclosures->elems[node_idx];
1863       switch (dfa->nodes[node].type)
1864         {
1865         case OP_BACK_REF:
1866           if (bkref_idx != -1)
1867             {
1868               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1869               do
1870                 {
1871                   int dst, cpos;
1872
1873                   if (ent->node != node)
1874                     continue;
1875
1876                   if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1877                       && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1878                     continue;
1879
1880                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1881                      OP_CLOSE_SUBEXP cases below.  But, if the
1882                      destination node is the same node as the source
1883                      node, don't recurse because it would cause an
1884                      infinite loop: a regex that exhibits this behavior
1885                      is ()\1*\1*  */
1886                   dst = dfa->edests[node].elems[0];
1887                   if (dst == from_node)
1888                     {
1889                       if (boundaries & 1)
1890                         return -1;
1891                       else /* if (boundaries & 2) */
1892                         return 0;
1893                     }
1894
1895                   cpos =
1896                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1897                                                  dst, bkref_idx);
1898                   if (cpos == -1 /* && (boundaries & 1) */)
1899                     return -1;
1900                   if (cpos == 0 && (boundaries & 2))
1901                     return 0;
1902
1903                   ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1904                 }
1905               while (ent++->more);
1906             }
1907           break;
1908
1909         case OP_OPEN_SUBEXP:
1910           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1911             return -1;
1912           break;
1913
1914         case OP_CLOSE_SUBEXP:
1915           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1916             return 0;
1917           break;
1918
1919         default:
1920             break;
1921         }
1922     }
1923
1924   return (boundaries & 2) ? 1 : 0;
1925 }
1926
1927 static int
1928 internal_function
1929 check_dst_limits_calc_pos (re_match_context_t *mctx, int limit, int subexp_idx,
1930                            int from_node, int str_idx, int bkref_idx)
1931 {
1932   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1933   int boundaries;
1934
1935   /* If we are outside the range of the subexpression, return -1 or 1.  */
1936   if (str_idx < lim->subexp_from)
1937     return -1;
1938
1939   if (lim->subexp_to < str_idx)
1940     return 1;
1941
1942   /* If we are within the subexpression, return 0.  */
1943   boundaries = (str_idx == lim->subexp_from);
1944   boundaries |= (str_idx == lim->subexp_to) << 1;
1945   if (boundaries == 0)
1946     return 0;
1947
1948   /* Else, examine epsilon closure.  */
1949   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1950                                       from_node, bkref_idx);
1951 }
1952
1953 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1954    which are against limitations from DEST_NODES. */
1955
1956 static reg_errcode_t
1957 internal_function
1958 check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
1959                      const re_node_set *candidates, re_node_set *limits,
1960                      struct re_backref_cache_entry *bkref_ents, int str_idx)
1961 {
1962   reg_errcode_t err;
1963   int node_idx, lim_idx;
1964
1965   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1966     {
1967       int subexp_idx;
1968       struct re_backref_cache_entry *ent;
1969       ent = bkref_ents + limits->elems[lim_idx];
1970
1971       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1972         continue; /* This is unrelated limitation.  */
1973
1974       subexp_idx = dfa->nodes[ent->node].opr.idx;
1975       if (ent->subexp_to == str_idx)
1976         {
1977           int ops_node = -1;
1978           int cls_node = -1;
1979           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1980             {
1981               int node = dest_nodes->elems[node_idx];
1982               re_token_type_t type = dfa->nodes[node].type;
1983               if (type == OP_OPEN_SUBEXP
1984                   && subexp_idx == dfa->nodes[node].opr.idx)
1985                 ops_node = node;
1986               else if (type == OP_CLOSE_SUBEXP
1987                        && subexp_idx == dfa->nodes[node].opr.idx)
1988                 cls_node = node;
1989             }
1990
1991           /* Check the limitation of the open subexpression.  */
1992           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
1993           if (ops_node >= 0)
1994             {
1995               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
1996                                            candidates);
1997               if (BE (err != REG_NOERROR, 0))
1998                 return err;
1999             }
2000
2001           /* Check the limitation of the close subexpression.  */
2002           if (cls_node >= 0)
2003             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2004               {
2005                 int node = dest_nodes->elems[node_idx];
2006                 if (!re_node_set_contains (dfa->inveclosures + node,
2007                                            cls_node)
2008                     && !re_node_set_contains (dfa->eclosures + node,
2009                                               cls_node))
2010                   {
2011                     /* It is against this limitation.
2012                        Remove it form the current sifted state.  */
2013                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2014                                                  candidates);
2015                     if (BE (err != REG_NOERROR, 0))
2016                       return err;
2017                     --node_idx;
2018                   }
2019               }
2020         }
2021       else /* (ent->subexp_to != str_idx)  */
2022         {
2023           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2024             {
2025               int node = dest_nodes->elems[node_idx];
2026               re_token_type_t type = dfa->nodes[node].type;
2027               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2028                 {
2029                   if (subexp_idx != dfa->nodes[node].opr.idx)
2030                     continue;
2031                   /* It is against this limitation.
2032                      Remove it form the current sifted state.  */
2033                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2034                                                candidates);
2035                   if (BE (err != REG_NOERROR, 0))
2036                     return err;
2037                 }
2038             }
2039         }
2040     }
2041   return REG_NOERROR;
2042 }
2043
2044 static reg_errcode_t
2045 internal_function
2046 sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2047                    int str_idx, const re_node_set *candidates)
2048 {
2049   re_dfa_t *const dfa = mctx->dfa;
2050   reg_errcode_t err;
2051   int node_idx, node;
2052   re_sift_context_t local_sctx;
2053   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2054
2055   if (first_idx == -1)
2056     return REG_NOERROR;
2057
2058   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2059
2060   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2061     {
2062       int enabled_idx;
2063       re_token_type_t type;
2064       struct re_backref_cache_entry *entry;
2065       node = candidates->elems[node_idx];
2066       type = dfa->nodes[node].type;
2067       /* Avoid infinite loop for the REs like "()\1+".  */
2068       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2069         continue;
2070       if (type != OP_BACK_REF)
2071         continue;
2072
2073       entry = mctx->bkref_ents + first_idx;
2074       enabled_idx = first_idx;
2075       do
2076         {
2077           int subexp_len, to_idx, dst_node, ret;
2078           re_dfastate_t *cur_state;
2079
2080           if (entry->node != node)
2081             continue;
2082           subexp_len = entry->subexp_to - entry->subexp_from;
2083           to_idx = str_idx + subexp_len;
2084           dst_node = (subexp_len ? dfa->nexts[node]
2085                       : dfa->edests[node].elems[0]);
2086
2087           if (to_idx > sctx->last_str_idx
2088               || sctx->sifted_states[to_idx] == NULL
2089               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2090               || check_dst_limits (mctx, &sctx->limits, node,
2091                                    str_idx, dst_node, to_idx))
2092             continue;
2093
2094           if (local_sctx.sifted_states == NULL)
2095             {
2096               local_sctx = *sctx;
2097               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2098               if (BE (err != REG_NOERROR, 0))
2099                 goto free_return;
2100             }
2101           local_sctx.last_node = node;
2102           local_sctx.last_str_idx = str_idx;
2103           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2104           if (BE (ret < 0, 0))
2105             {
2106               err = REG_ESPACE;
2107               goto free_return;
2108             }
2109           cur_state = local_sctx.sifted_states[str_idx];
2110           err = sift_states_backward (mctx, &local_sctx);
2111           if (BE (err != REG_NOERROR, 0))
2112             goto free_return;
2113           if (sctx->limited_states != NULL)
2114             {
2115               err = merge_state_array (dfa, sctx->limited_states,
2116                                        local_sctx.sifted_states,
2117                                        str_idx + 1);
2118               if (BE (err != REG_NOERROR, 0))
2119                 goto free_return;
2120             }
2121           local_sctx.sifted_states[str_idx] = cur_state;
2122           re_node_set_remove (&local_sctx.limits, enabled_idx);
2123
2124           /* mctx->bkref_ents may have changed, reload the pointer.  */
2125           entry = mctx->bkref_ents + enabled_idx;
2126         }
2127       while (enabled_idx++, entry++->more);
2128     }
2129   err = REG_NOERROR;
2130  free_return:
2131   if (local_sctx.sifted_states != NULL)
2132     {
2133       re_node_set_free (&local_sctx.limits);
2134     }
2135
2136   return err;
2137 }
2138
2139
2140 #ifdef RE_ENABLE_I18N
2141 static int
2142 internal_function
2143 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2144                      int node_idx, int str_idx, int max_str_idx)
2145 {
2146   re_dfa_t *const dfa = mctx->dfa;
2147   int naccepted;
2148   /* Check the node can accept `multi byte'.  */
2149   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2150   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2151       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2152                             dfa->nexts[node_idx]))
2153     /* The node can't accept the `multi byte', or the
2154        destination was already thrown away, then the node
2155        could't accept the current input `multi byte'.   */
2156     naccepted = 0;
2157   /* Otherwise, it is sure that the node could accept
2158      `naccepted' bytes input.  */
2159   return naccepted;
2160 }
2161 #endif /* RE_ENABLE_I18N */
2162
2163 \f
2164 /* Functions for state transition.  */
2165
2166 /* Return the next state to which the current state STATE will transit by
2167    accepting the current input byte, and update STATE_LOG if necessary.
2168    If STATE can accept a multibyte char/collating element/back reference
2169    update the destination of STATE_LOG.  */
2170
2171 static re_dfastate_t *
2172 internal_function
2173 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2174                re_dfastate_t *state)
2175 {
2176   re_dfastate_t **trtable;
2177   unsigned char ch;
2178
2179 #ifdef RE_ENABLE_I18N
2180   /* If the current state can accept multibyte.  */
2181   if (BE (state->accept_mb, 0))
2182     {
2183       *err = transit_state_mb (mctx, state);
2184       if (BE (*err != REG_NOERROR, 0))
2185         return NULL;
2186     }
2187 #endif /* RE_ENABLE_I18N */
2188
2189   /* Then decide the next state with the single byte.  */
2190 #if 0
2191   if (0)
2192     /* don't use transition table  */
2193     return transit_state_sb (err, mctx, state);
2194 #endif
2195
2196   /* Use transition table  */
2197   ch = re_string_fetch_byte (&mctx->input);
2198   for (;;)
2199     {
2200       trtable = state->trtable;
2201       if (BE (trtable != NULL, 1))
2202         return trtable[ch];
2203
2204       trtable = state->word_trtable;
2205       if (BE (trtable != NULL, 1))
2206         {
2207           unsigned int context;
2208           context
2209             = re_string_context_at (&mctx->input,
2210                                     re_string_cur_idx (&mctx->input) - 1,
2211                                     mctx->eflags);
2212           if (IS_WORD_CONTEXT (context))
2213             return trtable[ch + SBC_MAX];
2214           else
2215             return trtable[ch];
2216         }
2217
2218       if (!build_trtable (mctx->dfa, state))
2219         {
2220           *err = REG_ESPACE;
2221           return NULL;
2222         }
2223
2224       /* Retry, we now have a transition table.  */
2225     }
2226 }
2227
2228 /* Update the state_log if we need */
2229 re_dfastate_t *
2230 internal_function
2231 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2232                       re_dfastate_t *next_state)
2233 {
2234   re_dfa_t *const dfa = mctx->dfa;
2235   int cur_idx = re_string_cur_idx (&mctx->input);
2236
2237   if (cur_idx > mctx->state_log_top)
2238     {
2239       mctx->state_log[cur_idx] = next_state;
2240       mctx->state_log_top = cur_idx;
2241     }
2242   else if (mctx->state_log[cur_idx] == 0)
2243     {
2244       mctx->state_log[cur_idx] = next_state;
2245     }
2246   else
2247     {
2248       re_dfastate_t *pstate;
2249       unsigned int context;
2250       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2251       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2252          the destination of a multibyte char/collating element/
2253          back reference.  Then the next state is the union set of
2254          these destinations and the results of the transition table.  */
2255       pstate = mctx->state_log[cur_idx];
2256       log_nodes = pstate->entrance_nodes;
2257       if (next_state != NULL)
2258         {
2259           table_nodes = next_state->entrance_nodes;
2260           *err = re_node_set_init_union (&next_nodes, table_nodes,
2261                                              log_nodes);
2262           if (BE (*err != REG_NOERROR, 0))
2263             return NULL;
2264         }
2265       else
2266         next_nodes = *log_nodes;
2267       /* Note: We already add the nodes of the initial state,
2268          then we don't need to add them here.  */
2269
2270       context = re_string_context_at (&mctx->input,
2271                                       re_string_cur_idx (&mctx->input) - 1,
2272                                       mctx->eflags);
2273       next_state = mctx->state_log[cur_idx]
2274         = re_acquire_state_context (err, dfa, &next_nodes, context);
2275       /* We don't need to check errors here, since the return value of
2276          this function is next_state and ERR is already set.  */
2277
2278       if (table_nodes != NULL)
2279         re_node_set_free (&next_nodes);
2280     }
2281
2282   if (BE (dfa->nbackref, 0) && next_state != NULL)
2283     {
2284       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2285          later.  We must check them here, since the back references in the
2286          next state might use them.  */
2287       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2288                                         cur_idx);
2289       if (BE (*err != REG_NOERROR, 0))
2290         return NULL;
2291
2292       /* If the next state has back references.  */
2293       if (next_state->has_backref)
2294         {
2295           *err = transit_state_bkref (mctx, &next_state->nodes);
2296           if (BE (*err != REG_NOERROR, 0))
2297             return NULL;
2298           next_state = mctx->state_log[cur_idx];
2299         }
2300     }
2301
2302   return next_state;
2303 }
2304
2305 /* Skip bytes in the input that correspond to part of a
2306    multi-byte match, then look in the log for a state
2307    from which to restart matching.  */
2308 static re_dfastate_t *
2309 internal_function
2310 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2311 {
2312   re_dfastate_t *cur_state = NULL;
2313   do
2314     {
2315       int max = mctx->state_log_top;
2316       int cur_str_idx = re_string_cur_idx (&mctx->input);
2317
2318       do
2319         {
2320           if (++cur_str_idx > max)
2321             return NULL;
2322           re_string_skip_bytes (&mctx->input, 1);
2323         }
2324       while (mctx->state_log[cur_str_idx] == NULL);
2325
2326       cur_state = merge_state_with_log (err, mctx, NULL);
2327     }
2328   while (*err == REG_NOERROR && cur_state == NULL);
2329   return cur_state;
2330 }
2331
2332 /* Helper functions for transit_state.  */
2333
2334 /* From the node set CUR_NODES, pick up the nodes whose types are
2335    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2336    expression. And register them to use them later for evaluating the
2337    correspoding back references.  */
2338
2339 static reg_errcode_t
2340 internal_function
2341 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2342                            int str_idx)
2343 {
2344   re_dfa_t *const dfa = mctx->dfa;
2345   int node_idx;
2346   reg_errcode_t err;
2347
2348   /* TODO: This isn't efficient.
2349            Because there might be more than one nodes whose types are
2350            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2351            nodes.
2352            E.g. RE: (a){2}  */
2353   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2354     {
2355       int node = cur_nodes->elems[node_idx];
2356       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2357           && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2358           && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2359         {
2360           err = match_ctx_add_subtop (mctx, node, str_idx);
2361           if (BE (err != REG_NOERROR, 0))
2362             return err;
2363         }
2364     }
2365   return REG_NOERROR;
2366 }
2367
2368 #if 0
2369 /* Return the next state to which the current state STATE will transit by
2370    accepting the current input byte.  */
2371
2372 static re_dfastate_t *
2373 transit_state_sb (err, mctx, state)
2374      reg_errcode_t *err;
2375      re_match_context_t *mctx;
2376      re_dfastate_t *state;
2377 {
2378   re_dfa_t *const dfa = mctx->dfa;
2379   re_node_set next_nodes;
2380   re_dfastate_t *next_state;
2381   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2382   unsigned int context;
2383
2384   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2385   if (BE (*err != REG_NOERROR, 0))
2386     return NULL;
2387   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2388     {
2389       int cur_node = state->nodes.elems[node_cnt];
2390       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2391         {
2392           *err = re_node_set_merge (&next_nodes,
2393                                     dfa->eclosures + dfa->nexts[cur_node]);
2394           if (BE (*err != REG_NOERROR, 0))
2395             {
2396               re_node_set_free (&next_nodes);
2397               return NULL;
2398             }
2399         }
2400     }
2401   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2402   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2403   /* We don't need to check errors here, since the return value of
2404      this function is next_state and ERR is already set.  */
2405
2406   re_node_set_free (&next_nodes);
2407   re_string_skip_bytes (&mctx->input, 1);
2408   return next_state;
2409 }
2410 #endif
2411
2412 #ifdef RE_ENABLE_I18N
2413 static reg_errcode_t
2414 internal_function
2415 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2416 {
2417   re_dfa_t *const dfa = mctx->dfa;
2418   reg_errcode_t err;
2419   int i;
2420
2421   for (i = 0; i < pstate->nodes.nelem; ++i)
2422     {
2423       re_node_set dest_nodes, *new_nodes;
2424       int cur_node_idx = pstate->nodes.elems[i];
2425       int naccepted, dest_idx;
2426       unsigned int context;
2427       re_dfastate_t *dest_state;
2428
2429       if (!dfa->nodes[cur_node_idx].accept_mb)
2430         continue;
2431
2432       if (dfa->nodes[cur_node_idx].constraint)
2433         {
2434           context = re_string_context_at (&mctx->input,
2435                                           re_string_cur_idx (&mctx->input),
2436                                           mctx->eflags);
2437           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2438                                            context))
2439             continue;
2440         }
2441
2442       /* How many bytes the node can accept?  */
2443       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2444                                            re_string_cur_idx (&mctx->input));
2445       if (naccepted == 0)
2446         continue;
2447
2448       /* The node can accepts `naccepted' bytes.  */
2449       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2450       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2451                                : mctx->max_mb_elem_len);
2452       err = clean_state_log_if_needed (mctx, dest_idx);
2453       if (BE (err != REG_NOERROR, 0))
2454         return err;
2455 #ifdef DEBUG
2456       assert (dfa->nexts[cur_node_idx] != -1);
2457 #endif
2458       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2459
2460       dest_state = mctx->state_log[dest_idx];
2461       if (dest_state == NULL)
2462         dest_nodes = *new_nodes;
2463       else
2464         {
2465           err = re_node_set_init_union (&dest_nodes,
2466                                         dest_state->entrance_nodes, new_nodes);
2467           if (BE (err != REG_NOERROR, 0))
2468             return err;
2469         }
2470       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2471       mctx->state_log[dest_idx]
2472         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2473       if (dest_state != NULL)
2474         re_node_set_free (&dest_nodes);
2475       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2476         return err;
2477     }
2478   return REG_NOERROR;
2479 }
2480 #endif /* RE_ENABLE_I18N */
2481
2482 static reg_errcode_t
2483 internal_function
2484 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2485 {
2486   re_dfa_t *const dfa = mctx->dfa;
2487   reg_errcode_t err;
2488   int i;
2489   int cur_str_idx = re_string_cur_idx (&mctx->input);
2490
2491   for (i = 0; i < nodes->nelem; ++i)
2492     {
2493       int dest_str_idx, prev_nelem, bkc_idx;
2494       int node_idx = nodes->elems[i];
2495       unsigned int context;
2496       const re_token_t *node = dfa->nodes + node_idx;
2497       re_node_set *new_dest_nodes;
2498
2499       /* Check whether `node' is a backreference or not.  */
2500       if (node->type != OP_BACK_REF)
2501         continue;
2502
2503       if (node->constraint)
2504         {
2505           context = re_string_context_at (&mctx->input, cur_str_idx,
2506                                           mctx->eflags);
2507           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2508             continue;
2509         }
2510
2511       /* `node' is a backreference.
2512          Check the substring which the substring matched.  */
2513       bkc_idx = mctx->nbkref_ents;
2514       err = get_subexp (mctx, node_idx, cur_str_idx);
2515       if (BE (err != REG_NOERROR, 0))
2516         goto free_return;
2517
2518       /* And add the epsilon closures (which is `new_dest_nodes') of
2519          the backreference to appropriate state_log.  */
2520 #ifdef DEBUG
2521       assert (dfa->nexts[node_idx] != -1);
2522 #endif
2523       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2524         {
2525           int subexp_len;
2526           re_dfastate_t *dest_state;
2527           struct re_backref_cache_entry *bkref_ent;
2528           bkref_ent = mctx->bkref_ents + bkc_idx;
2529           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2530             continue;
2531           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2532           new_dest_nodes = (subexp_len == 0
2533                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2534                             : dfa->eclosures + dfa->nexts[node_idx]);
2535           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2536                           - bkref_ent->subexp_from);
2537           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2538                                           mctx->eflags);
2539           dest_state = mctx->state_log[dest_str_idx];
2540           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2541                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2542           /* Add `new_dest_node' to state_log.  */
2543           if (dest_state == NULL)
2544             {
2545               mctx->state_log[dest_str_idx]
2546                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2547                                             context);
2548               if (BE (mctx->state_log[dest_str_idx] == NULL
2549                       && err != REG_NOERROR, 0))
2550                 goto free_return;
2551             }
2552           else
2553             {
2554               re_node_set dest_nodes;
2555               err = re_node_set_init_union (&dest_nodes,
2556                                             dest_state->entrance_nodes,
2557                                             new_dest_nodes);
2558               if (BE (err != REG_NOERROR, 0))
2559                 {
2560                   re_node_set_free (&dest_nodes);
2561                   goto free_return;
2562                 }
2563               mctx->state_log[dest_str_idx]
2564                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2565               re_node_set_free (&dest_nodes);
2566               if (BE (mctx->state_log[dest_str_idx] == NULL
2567                       && err != REG_NOERROR, 0))
2568                 goto free_return;
2569             }
2570           /* We need to check recursively if the backreference can epsilon
2571              transit.  */
2572           if (subexp_len == 0
2573               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2574             {
2575               err = check_subexp_matching_top (mctx, new_dest_nodes,
2576                                                cur_str_idx);
2577               if (BE (err != REG_NOERROR, 0))
2578                 goto free_return;
2579               err = transit_state_bkref (mctx, new_dest_nodes);
2580               if (BE (err != REG_NOERROR, 0))
2581                 goto free_return;
2582             }
2583         }
2584     }
2585   err = REG_NOERROR;
2586  free_return:
2587   return err;
2588 }
2589
2590 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2591    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2592    Note that we might collect inappropriate candidates here.
2593    However, the cost of checking them strictly here is too high, then we
2594    delay these checking for prune_impossible_nodes().  */
2595
2596 static reg_errcode_t
2597 internal_function
2598 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2599 {
2600   re_dfa_t *const dfa = mctx->dfa;
2601   int subexp_num, sub_top_idx;
2602   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2603   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2604   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2605   if (cache_idx != -1)
2606     {
2607       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2608       do
2609         if (entry->node == bkref_node)
2610           return REG_NOERROR; /* We already checked it.  */
2611       while (entry++->more);
2612     }
2613
2614   subexp_num = dfa->nodes[bkref_node].opr.idx;
2615
2616   /* For each sub expression  */
2617   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2618     {
2619       reg_errcode_t err;
2620       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2621       re_sub_match_last_t *sub_last;
2622       int sub_last_idx, sl_str, bkref_str_off;
2623
2624       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2625         continue; /* It isn't related.  */
2626
2627       sl_str = sub_top->str_idx;
2628       bkref_str_off = bkref_str_idx;
2629       /* At first, check the last node of sub expressions we already
2630          evaluated.  */
2631       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2632         {
2633           int sl_str_diff;
2634           sub_last = sub_top->lasts[sub_last_idx];
2635           sl_str_diff = sub_last->str_idx - sl_str;
2636           /* The matched string by the sub expression match with the substring
2637              at the back reference?  */
2638           if (sl_str_diff > 0)
2639             {
2640               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2641                 {
2642                   /* Not enough chars for a successful match.  */
2643                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2644                     break;
2645
2646                   err = clean_state_log_if_needed (mctx,
2647                                                    bkref_str_off
2648                                                    + sl_str_diff);
2649                   if (BE (err != REG_NOERROR, 0))
2650                     return err;
2651                   buf = (const char *) re_string_get_buffer (&mctx->input);
2652                 }
2653               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2654                 break; /* We don't need to search this sub expression any more.  */
2655             }
2656           bkref_str_off += sl_str_diff;
2657           sl_str += sl_str_diff;
2658           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2659                                 bkref_str_idx);
2660
2661           /* Reload buf, since the preceding call might have reallocated
2662              the buffer.  */
2663           buf = (const char *) re_string_get_buffer (&mctx->input);
2664
2665           if (err == REG_NOMATCH)
2666             continue;
2667           if (BE (err != REG_NOERROR, 0))
2668             return err;
2669         }
2670
2671       if (sub_last_idx < sub_top->nlasts)
2672         continue;
2673       if (sub_last_idx > 0)
2674         ++sl_str;
2675       /* Then, search for the other last nodes of the sub expression.  */
2676       for (; sl_str <= bkref_str_idx; ++sl_str)
2677         {
2678           int cls_node, sl_str_off;
2679           const re_node_set *nodes;
2680           sl_str_off = sl_str - sub_top->str_idx;
2681           /* The matched string by the sub expression match with the substring
2682              at the back reference?  */
2683           if (sl_str_off > 0)
2684             {
2685               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2686                 {
2687                   /* If we are at the end of the input, we cannot match.  */
2688                   if (bkref_str_off >= mctx->input.len)
2689                     break;
2690
2691                   err = extend_buffers (mctx);
2692                   if (BE (err != REG_NOERROR, 0))
2693                     return err;
2694
2695                   buf = (const char *) re_string_get_buffer (&mctx->input);
2696                 }
2697               if (buf [bkref_str_off++] != buf[sl_str - 1])
2698                 break; /* We don't need to search this sub expression
2699                           any more.  */
2700             }
2701           if (mctx->state_log[sl_str] == NULL)
2702             continue;
2703           /* Does this state have a ')' of the sub expression?  */
2704           nodes = &mctx->state_log[sl_str]->nodes;
2705           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2706           if (cls_node == -1)
2707             continue; /* No.  */
2708           if (sub_top->path == NULL)
2709             {
2710               sub_top->path = re_calloc (state_array_t,
2711                                          sl_str - sub_top->str_idx + 1);
2712               if (sub_top->path == NULL)
2713                 return REG_ESPACE;
2714             }
2715           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2716              in the current context?  */
2717           err = check_arrival (mctx, sub_top->path, sub_top->node,
2718                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2719           if (err == REG_NOMATCH)
2720               continue;
2721           if (BE (err != REG_NOERROR, 0))
2722               return err;
2723           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2724           if (BE (sub_last == NULL, 0))
2725             return REG_ESPACE;
2726           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2727                                 bkref_str_idx);
2728           if (err == REG_NOMATCH)
2729             continue;
2730         }
2731     }
2732   return REG_NOERROR;
2733 }
2734
2735 /* Helper functions for get_subexp().  */
2736
2737 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2738    If it can arrive, register the sub expression expressed with SUB_TOP
2739    and SUB_LAST.  */
2740
2741 static reg_errcode_t
2742 internal_function
2743 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2744                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2745 {
2746   reg_errcode_t err;
2747   int to_idx;
2748   /* Can the subexpression arrive the back reference?  */
2749   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2750                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2751   if (err != REG_NOERROR)
2752     return err;
2753   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2754                              sub_last->str_idx);
2755   if (BE (err != REG_NOERROR, 0))
2756     return err;
2757   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2758   return clean_state_log_if_needed (mctx, to_idx);
2759 }
2760
2761 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2762    Search '(' if FL_OPEN, or search ')' otherwise.
2763    TODO: This function isn't efficient...
2764          Because there might be more than one nodes whose types are
2765          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2766          nodes.
2767          E.g. RE: (a){2}  */
2768
2769 static int
2770 internal_function
2771 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2772                   int subexp_idx, int type)
2773 {
2774   int cls_idx;
2775   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2776     {
2777       int cls_node = nodes->elems[cls_idx];
2778       const re_token_t *node = dfa->nodes + cls_node;
2779       if (node->type == type
2780           && node->opr.idx == subexp_idx)
2781         return cls_node;
2782     }
2783   return -1;
2784 }
2785
2786 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2787    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2788    heavily reused.
2789    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2790
2791 static reg_errcode_t
2792 internal_function
2793 check_arrival (re_match_context_t *mctx, state_array_t *path,
2794                int top_node, int top_str, int last_node, int last_str,
2795                int type)
2796 {
2797   re_dfa_t *const dfa = mctx->dfa;
2798   reg_errcode_t err;
2799   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2800   re_dfastate_t *cur_state = NULL;
2801   re_node_set *cur_nodes, next_nodes;
2802   re_dfastate_t **backup_state_log;
2803   unsigned int context;
2804
2805   subexp_num = dfa->nodes[top_node].opr.idx;
2806   /* Extend the buffer if we need.  */
2807   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2808     {
2809       re_dfastate_t **new_array;
2810       int old_alloc = path->alloc;
2811       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2812       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2813       if (new_array == NULL)
2814         {
2815           path->alloc = old_alloc;
2816           return REG_ESPACE;
2817         }
2818       path->array = new_array;
2819       memset (new_array + old_alloc, '\0',
2820               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2821     }
2822
2823   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2824
2825   /* Temporary modify MCTX.  */
2826   backup_state_log = mctx->state_log;
2827   backup_cur_idx = mctx->input.cur_idx;
2828   mctx->state_log = path->array;
2829   mctx->input.cur_idx = str_idx;
2830
2831   /* Setup initial node set.  */
2832   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2833   if (str_idx == top_str)
2834     {
2835       err = re_node_set_init_1 (&next_nodes, top_node);
2836       if (BE (err != REG_NOERROR, 0))
2837         return err;
2838       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2839       if (BE (err != REG_NOERROR, 0))
2840         {
2841           re_node_set_free (&next_nodes);
2842           return err;
2843         }
2844     }
2845   else
2846     {
2847       cur_state = mctx->state_log[str_idx];
2848       if (cur_state && cur_state->has_backref)
2849         {
2850           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2851           if (BE ( err != REG_NOERROR, 0))
2852             return err;
2853         }
2854       else
2855         re_node_set_init_empty (&next_nodes);
2856     }
2857   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2858     {
2859       if (next_nodes.nelem)
2860         {
2861           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2862                                     subexp_num, type);
2863           if (BE ( err != REG_NOERROR, 0))
2864             {
2865               re_node_set_free (&next_nodes);
2866               return err;
2867             }
2868         }
2869       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2870       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2871         {
2872           re_node_set_free (&next_nodes);
2873           return err;
2874         }
2875       mctx->state_log[str_idx] = cur_state;
2876     }
2877
2878   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2879     {
2880       re_node_set_empty (&next_nodes);
2881       if (mctx->state_log[str_idx + 1])
2882         {
2883           err = re_node_set_merge (&next_nodes,
2884                                    &mctx->state_log[str_idx + 1]->nodes);
2885           if (BE (err != REG_NOERROR, 0))
2886             {
2887               re_node_set_free (&next_nodes);
2888               return err;
2889             }
2890         }
2891       if (cur_state)
2892         {
2893           err = check_arrival_add_next_nodes (mctx, str_idx,
2894                                               &cur_state->non_eps_nodes, &next_nodes);
2895           if (BE (err != REG_NOERROR, 0))
2896             {
2897               re_node_set_free (&next_nodes);
2898               return err;
2899             }
2900         }
2901       ++str_idx;
2902       if (next_nodes.nelem)
2903         {
2904           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2905           if (BE (err != REG_NOERROR, 0))
2906             {
2907               re_node_set_free (&next_nodes);
2908               return err;
2909             }
2910           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2911                                     subexp_num, type);
2912           if (BE ( err != REG_NOERROR, 0))
2913             {
2914               re_node_set_free (&next_nodes);
2915               return err;
2916             }
2917         }
2918       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2919       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2920       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2921         {
2922           re_node_set_free (&next_nodes);
2923           return err;
2924         }
2925       mctx->state_log[str_idx] = cur_state;
2926       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2927     }
2928   re_node_set_free (&next_nodes);
2929   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2930                : &mctx->state_log[last_str]->nodes);
2931   path->next_idx = str_idx;
2932
2933   /* Fix MCTX.  */
2934   mctx->state_log = backup_state_log;
2935   mctx->input.cur_idx = backup_cur_idx;
2936
2937   /* Then check the current node set has the node LAST_NODE.  */
2938   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2939     return REG_NOERROR;
2940
2941   return REG_NOMATCH;
2942 }
2943
2944 /* Helper functions for check_arrival.  */
2945
2946 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2947    to NEXT_NODES.
2948    TODO: This function is similar to the functions transit_state*(),
2949          however this function has many additional works.
2950          Can't we unify them?  */
2951
2952 static reg_errcode_t
2953 internal_function
2954 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
2955                               re_node_set *cur_nodes,
2956                               re_node_set *next_nodes)
2957 {
2958   re_dfa_t *const dfa = mctx->dfa;
2959   int result;
2960   int cur_idx;
2961   reg_errcode_t err;
2962   re_node_set union_set;
2963   re_node_set_init_empty (&union_set);
2964   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2965     {
2966       int naccepted = 0;
2967       int cur_node = cur_nodes->elems[cur_idx];
2968 #ifdef DEBUG
2969       re_token_type_t type = dfa->nodes[cur_node].type;
2970       assert (!IS_EPSILON_NODE (type));
2971 #endif
2972 #ifdef RE_ENABLE_I18N
2973       /* If the node may accept `multi byte'.  */
2974       if (dfa->nodes[cur_node].accept_mb)
2975         {
2976           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2977                                                str_idx);
2978           if (naccepted > 1)
2979             {
2980               re_dfastate_t *dest_state;
2981               int next_node = dfa->nexts[cur_node];
2982               int next_idx = str_idx + naccepted;
2983               dest_state = mctx->state_log[next_idx];
2984               re_node_set_empty (&union_set);
2985               if (dest_state)
2986                 {
2987                   err = re_node_set_merge (&union_set, &dest_state->nodes);
2988                   if (BE (err != REG_NOERROR, 0))
2989                     {
2990                       re_node_set_free (&union_set);
2991                       return err;
2992                     }
2993                 }
2994               result = re_node_set_insert (&union_set, next_node);
2995               if (BE (result < 0, 0))
2996                 {
2997                   re_node_set_free (&union_set);
2998                   return REG_ESPACE;
2999                 }
3000               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3001                                                             &union_set);
3002               if (BE (mctx->state_log[next_idx] == NULL
3003                       && err != REG_NOERROR, 0))
3004                 {
3005                   re_node_set_free (&union_set);
3006                   return err;
3007                 }
3008             }
3009         }
3010 #endif /* RE_ENABLE_I18N */
3011       if (naccepted
3012           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3013         {
3014           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3015           if (BE (result < 0, 0))
3016             {
3017               re_node_set_free (&union_set);
3018               return REG_ESPACE;
3019             }
3020         }
3021     }
3022   re_node_set_free (&union_set);
3023   return REG_NOERROR;
3024 }
3025
3026 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3027    CUR_NODES, however exclude the nodes which are:
3028     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3029     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3030 */
3031
3032 static reg_errcode_t
3033 internal_function
3034 check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3035                           int ex_subexp, int type)
3036 {
3037   reg_errcode_t err;
3038   int idx, outside_node;
3039   re_node_set new_nodes;
3040 #ifdef DEBUG
3041   assert (cur_nodes->nelem);
3042 #endif
3043   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3044   if (BE (err != REG_NOERROR, 0))
3045     return err;
3046   /* Create a new node set NEW_NODES with the nodes which are epsilon
3047      closures of the node in CUR_NODES.  */
3048
3049   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3050     {
3051       int cur_node = cur_nodes->elems[idx];
3052       re_node_set *eclosure = dfa->eclosures + cur_node;
3053       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3054       if (outside_node == -1)
3055         {
3056           /* There are no problematic nodes, just merge them.  */
3057           err = re_node_set_merge (&new_nodes, eclosure);
3058           if (BE (err != REG_NOERROR, 0))
3059             {
3060               re_node_set_free (&new_nodes);
3061               return err;
3062             }
3063         }
3064       else
3065         {
3066           /* There are problematic nodes, re-calculate incrementally.  */
3067           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3068                                               ex_subexp, type);
3069           if (BE (err != REG_NOERROR, 0))
3070             {
3071               re_node_set_free (&new_nodes);
3072               return err;
3073             }
3074         }
3075     }
3076   re_node_set_free (cur_nodes);
3077   *cur_nodes = new_nodes;
3078   return REG_NOERROR;
3079 }
3080
3081 /* Helper function for check_arrival_expand_ecl.
3082    Check incrementally the epsilon closure of TARGET, and if it isn't
3083    problematic append it to DST_NODES.  */
3084
3085 static reg_errcode_t
3086 internal_function
3087 check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3088                               int target, int ex_subexp, int type)
3089 {
3090   int cur_node;
3091   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3092     {
3093       int err;
3094
3095       if (dfa->nodes[cur_node].type == type
3096           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3097         {
3098           if (type == OP_CLOSE_SUBEXP)
3099             {
3100               err = re_node_set_insert (dst_nodes, cur_node);
3101               if (BE (err == -1, 0))
3102                 return REG_ESPACE;
3103             }
3104           break;
3105         }
3106       err = re_node_set_insert (dst_nodes, cur_node);
3107       if (BE (err == -1, 0))
3108         return REG_ESPACE;
3109       if (dfa->edests[cur_node].nelem == 0)
3110         break;
3111       if (dfa->edests[cur_node].nelem == 2)
3112         {
3113           reg_errcode_t ret =
3114             check_arrival_expand_ecl_sub (dfa, dst_nodes,
3115                                           dfa->edests[cur_node].elems[1],
3116                                           ex_subexp, type);
3117           if (BE (ret != REG_NOERROR, 0))
3118             return ret;
3119         }
3120       cur_node = dfa->edests[cur_node].elems[0];
3121     }
3122   return REG_NOERROR;
3123 }
3124
3125
3126 /* For all the back references in the current state, calculate the
3127    destination of the back references by the appropriate entry
3128    in MCTX->BKREF_ENTS.  */
3129
3130 static reg_errcode_t
3131 internal_function
3132 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3133                     int cur_str, int subexp_num, int type)
3134 {
3135   re_dfa_t *const dfa = mctx->dfa;
3136   reg_errcode_t err;
3137   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3138   struct re_backref_cache_entry *ent;
3139
3140   if (cache_idx_start == -1)
3141     return REG_NOERROR;
3142
3143  restart:
3144   ent = mctx->bkref_ents + cache_idx_start;
3145   do
3146     {
3147       int to_idx, next_node;
3148
3149       /* Is this entry ENT is appropriate?  */
3150       if (!re_node_set_contains (cur_nodes, ent->node))
3151         continue; /* No.  */
3152
3153       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3154       /* Calculate the destination of the back reference, and append it
3155          to MCTX->STATE_LOG.  */
3156       if (to_idx == cur_str)
3157         {
3158           /* The backreference did epsilon transit, we must re-check all the
3159              node in the current state.  */
3160           re_node_set new_dests;
3161           reg_errcode_t err2, err3;
3162           next_node = dfa->edests[ent->node].elems[0];
3163           if (re_node_set_contains (cur_nodes, next_node))
3164             continue;
3165           err = re_node_set_init_1 (&new_dests, next_node);
3166           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3167           err3 = re_node_set_merge (cur_nodes, &new_dests);
3168           re_node_set_free (&new_dests);
3169           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3170                   || err3 != REG_NOERROR, 0))
3171             {
3172               err = (err != REG_NOERROR ? err
3173                      : (err2 != REG_NOERROR ? err2 : err3));
3174               return err;
3175             }
3176           /* TODO: It is still inefficient...  */
3177           goto restart;
3178         }
3179       else
3180         {
3181           re_node_set union_set;
3182           next_node = dfa->nexts[ent->node];
3183           if (mctx->state_log[to_idx])
3184             {
3185               int ret;
3186               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3187                                         next_node))
3188                 continue;
3189               err = re_node_set_init_copy (&union_set,
3190                                            &mctx->state_log[to_idx]->nodes);
3191               ret = re_node_set_insert (&union_set, next_node);
3192               if (BE (err != REG_NOERROR || ret < 0, 0))
3193                 {
3194                   re_node_set_free (&union_set);
3195                   err = err != REG_NOERROR ? err : REG_ESPACE;
3196                   return err;
3197                 }
3198             }
3199           else
3200             {
3201               err = re_node_set_init_1 (&union_set, next_node);
3202               if (BE (err != REG_NOERROR, 0))
3203                 return err;
3204             }
3205           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3206           re_node_set_free (&union_set);
3207           if (BE (mctx->state_log[to_idx] == NULL
3208                   && err != REG_NOERROR, 0))
3209             return err;
3210         }
3211     }
3212   while (ent++->more);
3213   return REG_NOERROR;
3214 }
3215
3216 /* Build transition table for the state.
3217    Return 1 if succeeded, otherwise return NULL.  */
3218
3219 static int
3220 internal_function
3221 build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3222 {
3223   reg_errcode_t err;
3224   int i, j, ch, need_word_trtable = 0;
3225   unsigned int elem, mask;
3226   int dests_node_malloced = 0, dest_states_malloced = 0;
3227   int ndests; /* Number of the destination states from `state'.  */
3228   re_dfastate_t **trtable;
3229   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3230   re_node_set follows, *dests_node;
3231   bitset *dests_ch;
3232   bitset acceptable;
3233
3234   /* We build DFA states which corresponds to the destination nodes
3235      from `state'.  `dests_node[i]' represents the nodes which i-th
3236      destination state contains, and `dests_ch[i]' represents the
3237      characters which i-th destination state accepts.  */
3238 #ifdef _LIBC
3239   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3240     dests_node = (re_node_set *)
3241       alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3242   else
3243 #endif
3244     {
3245       dests_node = (re_node_set *)
3246         malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3247       if (BE (dests_node == NULL, 0))
3248         return 0;
3249       dests_node_malloced = 1;
3250     }
3251   dests_ch = (bitset *) (dests_node + SBC_MAX);
3252
3253   /* Initialize transiton table.  */
3254   state->word_trtable = state->trtable = NULL;
3255
3256   /* At first, group all nodes belonging to `state' into several
3257      destinations.  */
3258   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3259   if (BE (ndests <= 0, 0))
3260     {
3261       if (dests_node_malloced)
3262         free (dests_node);
3263       /* Return 0 in case of an error, 1 otherwise.  */
3264       if (ndests == 0)
3265         {
3266           state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3267           return 1;
3268         }
3269       return 0;
3270     }
3271
3272   err = re_node_set_alloc (&follows, ndests + 1);
3273   if (BE (err != REG_NOERROR, 0))
3274     goto out_free;
3275
3276 #ifdef _LIBC
3277   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3278                          + ndests * 3 * sizeof (re_dfastate_t *)))
3279     dest_states = (re_dfastate_t **)
3280       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3281   else
3282 #endif
3283     {
3284       dest_states = (re_dfastate_t **)
3285         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3286       if (BE (dest_states == NULL, 0))
3287         {
3288 out_free:
3289           if (dest_states_malloced)
3290             free (dest_states);
3291           re_node_set_free (&follows);
3292           for (i = 0; i < ndests; ++i)
3293             re_node_set_free (dests_node + i);
3294           if (dests_node_malloced)
3295             free (dests_node);
3296           return 0;
3297         }
3298       dest_states_malloced = 1;
3299     }
3300   dest_states_word = dest_states + ndests;
3301   dest_states_nl = dest_states_word + ndests;
3302   bitset_empty (acceptable);
3303
3304   /* Then build the states for all destinations.  */
3305   for (i = 0; i < ndests; ++i)
3306     {
3307       int next_node;
3308       re_node_set_empty (&follows);
3309       /* Merge the follows of this destination states.  */
3310       for (j = 0; j < dests_node[i].nelem; ++j)
3311         {
3312           next_node = dfa->nexts[dests_node[i].elems[j]];
3313           if (next_node != -1)
3314             {
3315               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3316               if (BE (err != REG_NOERROR, 0))
3317                 goto out_free;
3318             }
3319         }
3320       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3321       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3322         goto out_free;
3323       /* If the new state has context constraint,
3324          build appropriate states for these contexts.  */
3325       if (dest_states[i]->has_constraint)
3326         {
3327           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3328                                                           CONTEXT_WORD);
3329           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3330             goto out_free;
3331
3332           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3333             need_word_trtable = 1;
3334
3335           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3336                                                         CONTEXT_NEWLINE);
3337           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3338             goto out_free;
3339         }
3340       else
3341         {
3342           dest_states_word[i] = dest_states[i];
3343           dest_states_nl[i] = dest_states[i];
3344         }
3345       bitset_merge (acceptable, dests_ch[i]);
3346     }
3347
3348   if (!BE (need_word_trtable, 0))
3349     {
3350       /* We don't care about whether the following character is a word
3351          character, or we are in a single-byte character set so we can
3352          discern by looking at the character code: allocate a
3353          256-entry transition table.  */
3354       trtable = state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3355       if (BE (trtable == NULL, 0))
3356         goto out_free;
3357
3358       /* For all characters ch...:  */
3359       for (i = 0; i < BITSET_UINTS; ++i)
3360         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3361              elem;
3362              mask <<= 1, elem >>= 1, ++ch)
3363           if (BE (elem & 1, 0))
3364             {
3365               /* There must be exactly one destination which accepts
3366                  character ch.  See group_nodes_into_DFAstates.  */
3367               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3368                 ;
3369
3370               /* j-th destination accepts the word character ch.  */
3371               if (dfa->word_char[i] & mask)
3372                 trtable[ch] = dest_states_word[j];
3373               else
3374                 trtable[ch] = dest_states[j];
3375             }
3376     }
3377   else
3378     {
3379       /* We care about whether the following character is a word
3380          character, and we are in a multi-byte character set: discern
3381          by looking at the character code: build two 256-entry
3382          transition tables, one starting at trtable[0] and one
3383          starting at trtable[SBC_MAX].  */
3384       trtable = state->word_trtable = re_calloc (re_dfastate_t *, 2 * SBC_MAX);
3385       if (BE (trtable == NULL, 0))
3386         goto out_free;
3387
3388       /* For all characters ch...:  */
3389       for (i = 0; i < BITSET_UINTS; ++i)
3390         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3391              elem;
3392              mask <<= 1, elem >>= 1, ++ch)
3393           if (BE (elem & 1, 0))
3394             {
3395               /* There must be exactly one destination which accepts
3396                  character ch.  See group_nodes_into_DFAstates.  */
3397               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3398                 ;
3399
3400               /* j-th destination accepts the word character ch.  */
3401               trtable[ch] = dest_states[j];
3402               trtable[ch + SBC_MAX] = dest_states_word[j];
3403             }
3404     }
3405
3406   /* new line */
3407   if (bitset_contain (acceptable, NEWLINE_CHAR))
3408     {
3409       /* The current state accepts newline character.  */
3410       for (j = 0; j < ndests; ++j)
3411         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3412           {
3413             /* k-th destination accepts newline character.  */
3414             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3415             if (need_word_trtable)
3416               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3417             /* There must be only one destination which accepts
3418                newline.  See group_nodes_into_DFAstates.  */
3419             break;
3420           }
3421     }
3422
3423   if (dest_states_malloced)
3424     free (dest_states);
3425
3426   re_node_set_free (&follows);
3427   for (i = 0; i < ndests; ++i)
3428     re_node_set_free (dests_node + i);
3429
3430   if (dests_node_malloced)
3431     free (dests_node);
3432
3433   return 1;
3434 }
3435
3436 /* Group all nodes belonging to STATE into several destinations.
3437    Then for all destinations, set the nodes belonging to the destination
3438    to DESTS_NODE[i] and set the characters accepted by the destination
3439    to DEST_CH[i].  This function return the number of destinations.  */
3440
3441 static int
3442 internal_function
3443 group_nodes_into_DFAstates (re_dfa_t *dfa, const re_dfastate_t *state,
3444                             re_node_set *dests_node, bitset *dests_ch)
3445 {
3446   reg_errcode_t err;
3447   int result;
3448   int i, j, k;
3449   int ndests; /* Number of the destinations from `state'.  */
3450   bitset accepts; /* Characters a node can accept.  */
3451   const re_node_set *cur_nodes = &state->nodes;
3452   bitset_empty (accepts);
3453   ndests = 0;
3454
3455   /* For all the nodes belonging to `state',  */
3456   for (i = 0; i < cur_nodes->nelem; ++i)
3457     {
3458       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3459       re_token_type_t type = node->type;
3460       unsigned int constraint = node->constraint;
3461
3462       /* Enumerate all single byte character this node can accept.  */
3463       if (type == CHARACTER)
3464         bitset_set (accepts, node->opr.c);
3465       else if (type == SIMPLE_BRACKET)
3466         {
3467           bitset_merge (accepts, node->opr.sbcset);
3468         }
3469       else if (type == OP_PERIOD)
3470         {
3471 #ifdef RE_ENABLE_I18N
3472           if (dfa->mb_cur_max > 1)
3473             bitset_merge (accepts, dfa->sb_char);
3474           else
3475 #endif
3476             bitset_set_all (accepts);
3477           if (!(dfa->syntax & REG_DOT_NEWLINE))
3478             bitset_clear (accepts, '\n');
3479           if (dfa->syntax & REG_DOT_NOT_NULL)
3480             bitset_clear (accepts, '\0');
3481         }
3482 #ifdef RE_ENABLE_I18N
3483       else if (type == OP_UTF8_PERIOD)
3484         {
3485           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3486           if (!(dfa->syntax & REG_DOT_NEWLINE))
3487             bitset_clear (accepts, '\n');
3488           if (dfa->syntax & REG_DOT_NOT_NULL)
3489             bitset_clear (accepts, '\0');
3490         }
3491 #endif
3492       else
3493         continue;
3494
3495       /* Check the `accepts' and sift the characters which are not
3496          match it the context.  */
3497       if (constraint)
3498         {
3499           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3500             {
3501               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3502               bitset_empty (accepts);
3503               if (accepts_newline)
3504                 bitset_set (accepts, NEWLINE_CHAR);
3505               else
3506                 continue;
3507             }
3508           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3509             {
3510               bitset_empty (accepts);
3511               continue;
3512             }
3513
3514           if (constraint & NEXT_WORD_CONSTRAINT)
3515             {
3516               unsigned int any_set = 0;
3517               if (type == CHARACTER && !node->word_char)
3518                 {
3519                   bitset_empty (accepts);
3520                   continue;
3521                 }
3522 #ifdef RE_ENABLE_I18N
3523               if (dfa->mb_cur_max > 1)
3524                 for (j = 0; j < BITSET_UINTS; ++j)
3525                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3526               else
3527 #endif
3528                 for (j = 0; j < BITSET_UINTS; ++j)
3529                   any_set |= (accepts[j] &= dfa->word_char[j]);
3530               if (!any_set)
3531                 continue;
3532             }
3533           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3534             {
3535               unsigned int any_set = 0;
3536               if (type == CHARACTER && node->word_char)
3537                 {
3538                   bitset_empty (accepts);
3539                   continue;
3540                 }
3541 #ifdef RE_ENABLE_I18N
3542               if (dfa->mb_cur_max > 1)
3543                 for (j = 0; j < BITSET_UINTS; ++j)
3544                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3545               else
3546 #endif
3547                 for (j = 0; j < BITSET_UINTS; ++j)
3548                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3549               if (!any_set)
3550                 continue;
3551             }
3552         }
3553
3554       /* Then divide `accepts' into DFA states, or create a new
3555          state.  Above, we make sure that accepts is not empty.  */
3556       for (j = 0; j < ndests; ++j)
3557         {
3558           bitset intersec; /* Intersection sets, see below.  */
3559           bitset remains;
3560           /* Flags, see below.  */
3561           int has_intersec, not_subset, not_consumed;
3562
3563           /* Optimization, skip if this state doesn't accept the character.  */
3564           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3565             continue;
3566
3567           /* Enumerate the intersection set of this state and `accepts'.  */
3568           has_intersec = 0;
3569           for (k = 0; k < BITSET_UINTS; ++k)
3570             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3571           /* And skip if the intersection set is empty.  */
3572           if (!has_intersec)
3573             continue;
3574
3575           /* Then check if this state is a subset of `accepts'.  */
3576           not_subset = not_consumed = 0;
3577           for (k = 0; k < BITSET_UINTS; ++k)
3578             {
3579               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3580               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3581             }
3582
3583           /* If this state isn't a subset of `accepts', create a
3584              new group state, which has the `remains'. */
3585           if (not_subset)
3586             {
3587               bitset_copy (dests_ch[ndests], remains);
3588               bitset_copy (dests_ch[j], intersec);
3589               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3590               if (BE (err != REG_NOERROR, 0))
3591                 goto error_return;
3592               ++ndests;
3593             }
3594
3595           /* Put the position in the current group. */
3596           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3597           if (BE (result < 0, 0))
3598             goto error_return;
3599
3600           /* If all characters are consumed, go to next node. */
3601           if (!not_consumed)
3602             break;
3603         }
3604       /* Some characters remain, create a new group. */
3605       if (j == ndests)
3606         {
3607           bitset_copy (dests_ch[ndests], accepts);
3608           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3609           if (BE (err != REG_NOERROR, 0))
3610             goto error_return;
3611           ++ndests;
3612           bitset_empty (accepts);
3613         }
3614     }
3615   return ndests;
3616  error_return:
3617   for (j = 0; j < ndests; ++j)
3618     re_node_set_free (dests_node + j);
3619   return -1;
3620 }
3621
3622 #ifdef RE_ENABLE_I18N
3623 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3624    Return the number of the bytes the node accepts.
3625    STR_IDX is the current index of the input string.
3626
3627    This function handles the nodes which can accept one character, or
3628    one collating element like '.', '[a-z]', opposite to the other nodes
3629    can only accept one byte.  */
3630
3631 static int
3632 internal_function
3633 check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
3634                          const re_string_t *input, int str_idx)
3635 {
3636   const re_token_t *node = dfa->nodes + node_idx;
3637   int char_len, elem_len;
3638   int i;
3639
3640   if (BE (node->type == OP_UTF8_PERIOD, 0))
3641     {
3642       unsigned char c = re_string_byte_at (input, str_idx), d;
3643       if (BE (c < 0xc2, 1))
3644         return 0;
3645
3646       if (str_idx + 2 > input->len)
3647         return 0;
3648
3649       d = re_string_byte_at (input, str_idx + 1);
3650       if (c < 0xe0)
3651         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3652       else if (c < 0xf0)
3653         {
3654           char_len = 3;
3655           if (c == 0xe0 && d < 0xa0)
3656             return 0;
3657         }
3658       else if (c < 0xf8)
3659         {
3660           char_len = 4;
3661           if (c == 0xf0 && d < 0x90)
3662             return 0;
3663         }
3664       else if (c < 0xfc)
3665         {
3666           char_len = 5;
3667           if (c == 0xf8 && d < 0x88)
3668             return 0;
3669         }
3670       else if (c < 0xfe)
3671         {
3672           char_len = 6;
3673           if (c == 0xfc && d < 0x84)
3674             return 0;
3675         }
3676       else
3677         return 0;
3678
3679       if (str_idx + char_len > input->len)
3680         return 0;
3681
3682       for (i = 1; i < char_len; ++i)
3683         {
3684           d = re_string_byte_at (input, str_idx + i);
3685           if (d < 0x80 || d > 0xbf)
3686             return 0;
3687         }
3688       return char_len;
3689     }
3690
3691   char_len = re_string_char_size_at (input, str_idx);
3692   if (node->type == OP_PERIOD)
3693     {
3694       if (char_len <= 1)
3695         return 0;
3696       /* FIXME: I don't think this if is needed, as both '\n'
3697          and '\0' are char_len == 1.  */
3698       /* '.' accepts any one character except the following two cases.  */
3699       if ((!(dfa->syntax & REG_DOT_NEWLINE) &&
3700            re_string_byte_at (input, str_idx) == '\n') ||
3701           ((dfa->syntax & REG_DOT_NOT_NULL) &&
3702            re_string_byte_at (input, str_idx) == '\0'))
3703         return 0;
3704       return char_len;
3705     }
3706
3707   elem_len = re_string_elem_size_at (input, str_idx);
3708   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3709     return 0;
3710
3711   if (node->type == COMPLEX_BRACKET)
3712     {
3713       const re_charset_t *cset = node->opr.mbcset;
3714 # ifdef _LIBC
3715       const unsigned char *pin
3716         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3717       int j;
3718       uint32_t nrules;
3719 # endif /* _LIBC */
3720       int match_len = 0;
3721       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3722                     ? re_string_wchar_at (input, str_idx) : 0);
3723
3724       /* match with multibyte character?  */
3725       for (i = 0; i < cset->nmbchars; ++i)
3726         if (wc == cset->mbchars[i])
3727           {
3728             match_len = char_len;
3729             goto check_node_accept_bytes_match;
3730           }
3731       /* match with character_class?  */
3732       for (i = 0; i < cset->nchar_classes; ++i)
3733         {
3734           wctype_t wt = cset->char_classes[i];
3735           if (__iswctype (wc, wt))
3736             {
3737               match_len = char_len;
3738               goto check_node_accept_bytes_match;
3739             }
3740         }
3741
3742 # ifdef _LIBC
3743       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3744       if (nrules != 0)
3745         {
3746           unsigned int in_collseq = 0;
3747           const int32_t *table, *indirect;
3748           const unsigned char *weights, *extra;
3749           const char *collseqwc;
3750           int32_t idx;
3751           /* This #include defines a local function!  */
3752 #  include <locale/weight.h>
3753
3754           /* match with collating_symbol?  */
3755           if (cset->ncoll_syms)
3756             extra = (const unsigned char *)
3757               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3758           for (i = 0; i < cset->ncoll_syms; ++i)
3759             {
3760               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3761               /* Compare the length of input collating element and
3762                  the length of current collating element.  */
3763               if (*coll_sym != elem_len)
3764                 continue;
3765               /* Compare each bytes.  */
3766               for (j = 0; j < *coll_sym; j++)
3767                 if (pin[j] != coll_sym[1 + j])
3768                   break;
3769               if (j == *coll_sym)
3770                 {
3771                   /* Match if every bytes is equal.  */
3772                   match_len = j;
3773                   goto check_node_accept_bytes_match;
3774                 }
3775             }
3776
3777           if (cset->nranges)
3778             {
3779               if (elem_len <= char_len)
3780                 {
3781                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3782                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3783                 }
3784               else
3785                 in_collseq = find_collation_sequence_value (pin, elem_len);
3786             }
3787           /* match with range expression?  */
3788           for (i = 0; i < cset->nranges; ++i)
3789             if (cset->range_starts[i] <= in_collseq
3790                 && in_collseq <= cset->range_ends[i])
3791               {
3792                 match_len = elem_len;
3793                 goto check_node_accept_bytes_match;
3794               }
3795
3796           /* match with equivalence_class?  */
3797           if (cset->nequiv_classes)
3798             {
3799               const unsigned char *cp = pin;
3800               table = (const int32_t *)
3801                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3802               weights = (const unsigned char *)
3803                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3804               extra = (const unsigned char *)
3805                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3806               indirect = (const int32_t *)
3807                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3808               idx = findidx (&cp);
3809               if (idx > 0)
3810                 for (i = 0; i < cset->nequiv_classes; ++i)
3811                   {
3812                     int32_t equiv_class_idx = cset->equiv_classes[i];
3813                     size_t weight_len = weights[idx];
3814                     if (weight_len == weights[equiv_class_idx])
3815                       {
3816                         int cnt = 0;
3817                         while (cnt <= weight_len
3818                                && (weights[equiv_class_idx + 1 + cnt]
3819                                    == weights[idx + 1 + cnt]))
3820                           ++cnt;
3821                         if (cnt > weight_len)
3822                           {
3823                             match_len = elem_len;
3824                             goto check_node_accept_bytes_match;
3825                           }
3826                       }
3827                   }
3828             }
3829         }
3830       else
3831 # endif /* _LIBC */
3832         {
3833           /* match with range expression?  */
3834 #if __GNUC__ >= 2
3835           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3836 #else
3837           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3838           cmp_buf[2] = wc;
3839 #endif
3840           for (i = 0; i < cset->nranges; ++i)
3841             {
3842               cmp_buf[0] = cset->range_starts[i];
3843               cmp_buf[4] = cset->range_ends[i];
3844               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3845                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3846                 {
3847                   match_len = char_len;
3848                   goto check_node_accept_bytes_match;
3849                 }
3850             }
3851         }
3852     check_node_accept_bytes_match:
3853       if (!cset->non_match)
3854         return match_len;
3855       else
3856         {
3857           if (match_len > 0)
3858             return 0;
3859           else
3860             return (elem_len > char_len) ? elem_len : char_len;
3861         }
3862     }
3863   return 0;
3864 }
3865
3866 # ifdef _LIBC
3867 static unsigned int
3868 find_collation_sequence_value (mbs, mbs_len)
3869     const unsigned char *mbs;
3870     size_t mbs_len;
3871 {
3872   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3873   if (nrules == 0)
3874     {
3875       if (mbs_len == 1)
3876         {
3877           /* No valid character.  Match it as a single byte character.  */
3878           const unsigned char *collseq = (const unsigned char *)
3879             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3880           return collseq[mbs[0]];
3881         }
3882       return UINT_MAX;
3883     }
3884   else
3885     {
3886       int32_t idx;
3887       const unsigned char *extra = (const unsigned char *)
3888         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3889       int32_t extrasize = (const unsigned char *)
3890         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3891
3892       for (idx = 0; idx < extrasize;)
3893         {
3894           int mbs_cnt, found = 0;
3895           int32_t elem_mbs_len;
3896           /* Skip the name of collating element name.  */
3897           idx = idx + extra[idx] + 1;
3898           elem_mbs_len = extra[idx++];
3899           if (mbs_len == elem_mbs_len)
3900             {
3901               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3902                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3903                   break;
3904               if (mbs_cnt == elem_mbs_len)
3905                 /* Found the entry.  */
3906                 found = 1;
3907             }
3908           /* Skip the byte sequence of the collating element.  */
3909           idx += elem_mbs_len;
3910           /* Adjust for the alignment.  */
3911           idx = (idx + 3) & ~3;
3912           /* Skip the collation sequence value.  */
3913           idx += sizeof (uint32_t);
3914           /* Skip the wide char sequence of the collating element.  */
3915           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3916           /* If we found the entry, return the sequence value.  */
3917           if (found)
3918             return *(uint32_t *) (extra + idx);
3919           /* Skip the collation sequence value.  */
3920           idx += sizeof (uint32_t);
3921         }
3922       return UINT_MAX;
3923     }
3924 }
3925 # endif /* _LIBC */
3926 #endif /* RE_ENABLE_I18N */
3927
3928 /* Check whether the node accepts the byte which is IDX-th
3929    byte of the INPUT.  */
3930
3931 static int
3932 internal_function
3933 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3934                    int idx)
3935 {
3936   unsigned char ch;
3937   ch = re_string_byte_at (&mctx->input, idx);
3938   switch (node->type)
3939     {
3940     case CHARACTER:
3941       if (node->opr.c != ch)
3942         return 0;
3943       break;
3944
3945     case SIMPLE_BRACKET:
3946       if (!bitset_contain (node->opr.sbcset, ch))
3947         return 0;
3948       break;
3949
3950 #ifdef RE_ENABLE_I18N
3951     case OP_UTF8_PERIOD:
3952       if (ch >= 0x80)
3953         return 0;
3954       /* FALLTHROUGH */
3955 #endif
3956     case OP_PERIOD:
3957       if ((ch == '\n' && !(mctx->dfa->syntax & REG_DOT_NEWLINE))
3958           || (ch == '\0' && (mctx->dfa->syntax & REG_DOT_NOT_NULL)))
3959         return 0;
3960       break;
3961
3962     default:
3963       return 0;
3964     }
3965
3966   if (node->constraint)
3967     {
3968       /* The node has constraints.  Check whether the current context
3969          satisfies the constraints.  */
3970       unsigned int context = re_string_context_at (&mctx->input, idx,
3971                                                    mctx->eflags);
3972       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3973         return 0;
3974     }
3975
3976   return 1;
3977 }
3978
3979 /* Extend the buffers, if the buffers have run out.  */
3980
3981 static reg_errcode_t
3982 internal_function
3983 extend_buffers (re_match_context_t *mctx)
3984 {
3985   reg_errcode_t ret;
3986   re_string_t *pstr = &mctx->input;
3987
3988   /* Double the lengthes of the buffers.  */
3989   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3990   if (BE (ret != REG_NOERROR, 0))
3991     return ret;
3992
3993   if (mctx->state_log != NULL)
3994     {
3995       /* And double the length of state_log.  */
3996       /* XXX We have no indication of the size of this buffer.  If this
3997          allocation fail we have no indication that the state_log array
3998          does not have the right size.  */
3999       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4000                                               pstr->bufs_len + 1);
4001       if (BE (new_array == NULL, 0))
4002         return REG_ESPACE;
4003       mctx->state_log = new_array;
4004     }
4005
4006   /* Then reconstruct the buffers.  */
4007   if (pstr->icase)
4008     {
4009 #ifdef RE_ENABLE_I18N
4010       if (pstr->mb_cur_max > 1)
4011         {
4012           ret = build_wcs_upper_buffer (pstr);
4013           if (BE (ret != REG_NOERROR, 0))
4014             return ret;
4015         }
4016       else
4017 #endif /* RE_ENABLE_I18N  */
4018         build_upper_buffer (pstr);
4019     }
4020   else
4021     {
4022 #ifdef RE_ENABLE_I18N
4023       if (pstr->mb_cur_max > 1)
4024         build_wcs_buffer (pstr);
4025       else
4026 #endif /* RE_ENABLE_I18N  */
4027         {
4028           if (pstr->trans != NULL)
4029             re_string_translate_buffer (pstr);
4030         }
4031     }
4032   return REG_NOERROR;
4033 }
4034
4035 \f
4036 /* Functions for matching context.  */
4037
4038 /* Initialize MCTX.  */
4039
4040 static reg_errcode_t
4041 internal_function
4042 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4043 {
4044   mctx->eflags = eflags;
4045   mctx->match_last = -1;
4046   if (n > 0)
4047     {
4048       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4049       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4050       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4051         return REG_ESPACE;
4052     }
4053   /* Already zero-ed by the caller.
4054      else
4055        mctx->bkref_ents = NULL;
4056      mctx->nbkref_ents = 0;
4057      mctx->nsub_tops = 0;  */
4058   mctx->abkref_ents = n;
4059   mctx->max_mb_elem_len = 1;
4060   mctx->asub_tops = n;
4061   return REG_NOERROR;
4062 }
4063
4064 /* Clean the entries which depend on the current input in MCTX.
4065    This function must be invoked when the matcher changes the start index
4066    of the input, or changes the input string.  */
4067
4068 static void
4069 internal_function
4070 match_ctx_clean (re_match_context_t *mctx)
4071 {
4072   int st_idx;
4073   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4074     {
4075       int sl_idx;
4076       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4077       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4078         {
4079           re_sub_match_last_t *last = top->lasts[sl_idx];
4080           re_free (last->path.array);
4081           re_free (last);
4082         }
4083       re_free (top->lasts);
4084       if (top->path)
4085         {
4086           re_free (top->path->array);
4087           re_free (top->path);
4088         }
4089       free (top);
4090     }
4091
4092   mctx->nsub_tops = 0;
4093   mctx->nbkref_ents = 0;
4094 }
4095
4096 /* Free all the memory associated with MCTX.  */
4097
4098 static void
4099 internal_function
4100 match_ctx_free (re_match_context_t *mctx)
4101 {
4102   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4103   match_ctx_clean (mctx);
4104   re_free (mctx->sub_tops);
4105   re_free (mctx->bkref_ents);
4106 }
4107
4108 /* Add a new backreference entry to MCTX.
4109    Note that we assume that caller never call this function with duplicate
4110    entry, and call with STR_IDX which isn't smaller than any existing entry.
4111 */
4112
4113 static reg_errcode_t
4114 internal_function
4115 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx,
4116                      int from, int to)
4117 {
4118   if (mctx->nbkref_ents >= mctx->abkref_ents)
4119     {
4120       struct re_backref_cache_entry* new_entry;
4121       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4122                               mctx->abkref_ents * 2);
4123       if (BE (new_entry == NULL, 0))
4124         {
4125           re_free (mctx->bkref_ents);
4126           return REG_ESPACE;
4127         }
4128       mctx->bkref_ents = new_entry;
4129       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4130               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4131       mctx->abkref_ents *= 2;
4132     }
4133   if (mctx->nbkref_ents > 0
4134       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4135     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4136
4137   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4138   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4139   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4140   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4141
4142   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4143      If bit N is clear, means that this entry won't epsilon-transition to
4144      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4145      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4146      such node.
4147
4148      A backreference does not epsilon-transition unless it is empty, so set
4149      to all zeros if FROM != TO.  */
4150   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4151     = (from == to ? ~0 : 0);
4152
4153   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4154   if (mctx->max_mb_elem_len < to - from)
4155     mctx->max_mb_elem_len = to - from;
4156   return REG_NOERROR;
4157 }
4158
4159 /* Search for the first entry which has the same str_idx, or -1 if none is
4160    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4161
4162 static int
4163 internal_function
4164 search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
4165 {
4166   int left, right, mid, last;
4167   last = right = mctx->nbkref_ents;
4168   for (left = 0; left < right;)
4169     {
4170       mid = (left + right) / 2;
4171       if (mctx->bkref_ents[mid].str_idx < str_idx)
4172         left = mid + 1;
4173       else
4174         right = mid;
4175     }
4176   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4177     return left;
4178   else
4179     return -1;
4180 }
4181
4182 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4183    at STR_IDX.  */
4184
4185 static reg_errcode_t
4186 internal_function
4187 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4188 {
4189 #ifdef DEBUG
4190   assert (mctx->sub_tops != NULL);
4191   assert (mctx->asub_tops > 0);
4192 #endif
4193   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4194     {
4195       int new_asub_tops = mctx->asub_tops * 2;
4196       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4197                                                    re_sub_match_top_t *,
4198                                                    new_asub_tops);
4199       if (BE (new_array == NULL, 0))
4200         return REG_ESPACE;
4201       mctx->sub_tops = new_array;
4202       mctx->asub_tops = new_asub_tops;
4203     }
4204   mctx->sub_tops[mctx->nsub_tops] = re_calloc (re_sub_match_top_t, 1);
4205   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4206     return REG_ESPACE;
4207   mctx->sub_tops[mctx->nsub_tops]->node = node;
4208   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4209   return REG_NOERROR;
4210 }
4211
4212 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4213    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4214
4215 static re_sub_match_last_t *
4216 internal_function
4217 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4218 {
4219   re_sub_match_last_t *new_entry;
4220   if (BE (subtop->nlasts == subtop->alasts, 0))
4221     {
4222       int new_alasts = 2 * subtop->alasts + 1;
4223       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4224                                                     re_sub_match_last_t *,
4225                                                     new_alasts);
4226       if (BE (new_array == NULL, 0))
4227         return NULL;
4228       subtop->lasts = new_array;
4229       subtop->alasts = new_alasts;
4230     }
4231   new_entry = re_calloc (re_sub_match_last_t, 1);
4232   if (BE (new_entry != NULL, 1))
4233     {
4234       subtop->lasts[subtop->nlasts] = new_entry;
4235       new_entry->node = node;
4236       new_entry->str_idx = str_idx;
4237       ++subtop->nlasts;
4238     }
4239   return new_entry;
4240 }
4241
4242 static void
4243 internal_function
4244 sift_ctx_init (re_sift_context_t *sctx,
4245                re_dfastate_t **sifted_sts,
4246                re_dfastate_t **limited_sts,
4247                int last_node, int last_str_idx)
4248 {
4249   sctx->sifted_states = sifted_sts;
4250   sctx->limited_states = limited_sts;
4251   sctx->last_node = last_node;
4252   sctx->last_str_idx = last_str_idx;
4253   re_node_set_init_empty (&sctx->limits);
4254 }