lib/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      int n) internal_function;
  22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  23 static void match_ctx_free (re_match_context_t *cache) internal_function;
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  25                                           int str_idx, int from, int to)
  26      internal_function;
  27 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  28      internal_function;
  29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  30                                            int str_idx) internal_function;
  31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  32                                                    int node, int str_idx)
  33      internal_function;
  34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  35                            re_dfastate_t **limited_sts, int last_node,
  36                            int last_str_idx)
  37      internal_function;
  38 static reg_errcode_t re_search_internal (const regex_t *preg,
  39                                          const char *string, int length,
  40                                          int start, int range, int stop,
  41                                          size_t nmatch, regmatch_t pmatch[],
  42                                          int eflags) internal_function;
  43 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  44                              const char *string1, int length1,
  45                              const char *string2, int length2,
  46                              int start, int range, struct re_registers *regs,
  47                              int stop, int ret_len) internal_function;
  48 static int re_search_stub (struct re_pattern_buffer *bufp,
  49                            const char *string, int length, int start,
  50                            int range, int stop, struct re_registers *regs,
  51                            int ret_len) internal_function;
  52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  53                               int nregs, int regs_allocated) internal_function;
  54 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  55      internal_function;
  56 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  57                            int *p_match_first)
  58      internal_function;
  59 static int check_halt_state_context (const re_match_context_t *mctx,
  60                                      const re_dfastate_t *state, int idx)
  61      internal_function;
  62 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  63                          regmatch_t *prev_idx_match, int cur_node,
  64                          int cur_idx, int nmatch) internal_function;
  65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                       int str_idx, int dest_node, int nregs,
  67                                       regmatch_t *regs,
  68                                       re_node_set *eps_via_nodes) internal_function;
  69 static reg_errcode_t set_regs (const regex_t *preg,
  70                                const re_match_context_t *mctx,
  71                                size_t nmatch, regmatch_t *pmatch,
  72                                int fl_backtrack) internal_function;
  73 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  74
  75 #ifdef RE_ENABLE_I18N
  76 static int sift_states_iter_mb (const re_match_context_t *mctx,
  77                                 re_sift_context_t *sctx,
  78                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  79 #endif /* RE_ENABLE_I18N */
  80 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  81                                            re_sift_context_t *sctx) internal_function;
  82 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  83                                           re_sift_context_t *sctx, int str_idx,
  84                                           re_node_set *cur_dest) internal_function;
  85 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  86                                               re_sift_context_t *sctx,
  87                                               int str_idx,
  88                                               re_node_set *dest_nodes) internal_function;
  89 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
  90                                             re_node_set *dest_nodes,
  91                                             const re_node_set *candidates) internal_function;
  92 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
  93                              int dst_node, int dst_idx, int src_node,
  94                              int src_idx) internal_function;
  95 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
  96                                         int boundaries, int subexp_idx,
  97                                         int from_node, int bkref_idx) internal_function;
  98 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
  99                                       int limit, int subexp_idx,
 100                                       int node, int str_idx,
 101                                       int bkref_idx) internal_function;
 102 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 103                                           re_node_set *dest_nodes,
 104                                           const re_node_set *candidates,
 105                                           re_node_set *limits,
 106                                           struct re_backref_cache_entry *bkref_ents,
 107                                           int str_idx) internal_function;
 108 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 109                                         re_sift_context_t *sctx,
 110                                         int str_idx, const re_node_set *candidates) internal_function;
 111 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 112                                         re_dfastate_t **src, int num) internal_function;
 113 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 114                                          re_match_context_t *mctx) internal_function;
 115 static re_dfastate_t *transit_state (reg_errcode_t *err,
 116                                      re_match_context_t *mctx,
 117                                      re_dfastate_t *state) internal_function;
 118 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 119                                             re_match_context_t *mctx,
 120                                             re_dfastate_t *next_state) internal_function;
 121 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 122                                                 re_node_set *cur_nodes,
 123                                                 int str_idx) internal_function;
 124 #if 0
 125 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 126                                         re_match_context_t *mctx,
 127                                         re_dfastate_t *pstate) internal_function;
 128 #endif
 129 #ifdef RE_ENABLE_I18N
 130 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 131                                        re_dfastate_t *pstate) internal_function;
 132 #endif /* RE_ENABLE_I18N */
 133 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 134                                           const re_node_set *nodes) internal_function;
 135 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 136                                  int bkref_node, int bkref_str_idx) internal_function;
 137 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 138                                      const re_sub_match_top_t *sub_top,
 139                                      re_sub_match_last_t *sub_last,
 140                                      int bkref_node, int bkref_str) internal_function;
 141 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 142                              int subexp_idx, int type) internal_function;
 143 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 144                                     state_array_t *path, int top_node,
 145                                     int top_str, int last_node, int last_str,
 146                                     int type) internal_function;
 147 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 148                                                    int str_idx,
 149                                                    re_node_set *cur_nodes,
 150                                                    re_node_set *next_nodes) internal_function;
 151 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 152                                                re_node_set *cur_nodes,
 153                                                int ex_subexp, int type) internal_function;
 154 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 155                                                    re_node_set *dst_nodes,
 156                                                    int target, int ex_subexp,
 157                                                    int type) internal_function;
 158 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 159                                          re_node_set *cur_nodes, int cur_str,
 160                                          int subexp_num, int type) internal_function;
 161 static int build_trtable (re_dfa_t *dfa,
 162                           re_dfastate_t *state) internal_function;
 163 #ifdef RE_ENABLE_I18N
 164 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 165                                     const re_string_t *input, int idx) internal_function;
 166 # ifdef _LIBC
 167 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 168                                                    size_t name_len) internal_function;
 169 # endif /* _LIBC */
 170 #endif /* RE_ENABLE_I18N */
 171 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 172                                        const re_dfastate_t *state,
 173                                        re_node_set *states_node,
 174                                        bitset *states_ch) internal_function;
 175 static int check_node_accept (const re_match_context_t *mctx,
 176                               const re_token_t *node, int idx) internal_function;
 177 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 178 \f
 179 /* Entry point for POSIX code.  */
 180
 181 /* regexec searches for a given pattern, specified by PREG, in the
 182    string STRING.
 183
 184    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 185    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 186    least NMATCH elements, and we set them to the offsets of the
 187    corresponding matched substrings.
 188
 189    EFLAGS specifies `execution flags' which affect matching: if
 190    REG_NOTBOL is set, then ^ does not match at the beginning of the
 191    string; if REG_NOTEOL is set, then $ does not match at the end.
 192
 193    We return 0 if we find a match and REG_NOMATCH if not.  */
 194
 195 int
 196 regexec (const regex_t *__restrict preg, const char *__restrict string,
 197          size_t nmatch, regmatch_t pmatch[], int eflags)
 198 {
 199   reg_errcode_t err;
 200   int start, length;
 201 #ifdef _LIBC
 202   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 203 #endif
 204
 205   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 206     return REG_BADPAT;
 207
 208   if (eflags & REG_STARTEND)
 209     {
 210       start = pmatch[0].rm_so;
 211       length = pmatch[0].rm_eo;
 212     }
 213   else
 214     {
 215       start = 0;
 216       length = strlen (string);
 217     }
 218
 219   __libc_lock_lock (dfa->lock);
 220   if (preg->re_no_sub)
 221     err = re_search_internal (preg, string, length, start, length - start,
 222                               length, 0, NULL, eflags);
 223   else
 224     err = re_search_internal (preg, string, length, start, length - start,
 225                               length, nmatch, pmatch, eflags);
 226   __libc_lock_unlock (dfa->lock);
 227   return err != REG_NOERROR;
 228 }
 229
 230 #ifdef _LIBC
 231 # include <shlib-compat.h>
 232 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 233
 234 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 235 __typeof__ (__regexec) __compat_regexec;
 236
 237 int
 238 attribute_compat_text_section
 239 __compat_regexec (const regex_t *__restrict preg,
 240                   const char *__restrict string, size_t nmatch,
 241                   regmatch_t pmatch[], int eflags)
 242 {
 243   return regexec (preg, string, nmatch, pmatch,
 244                   eflags & (REG_NOTBOL | REG_NOTEOL));
 245 }
 246 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 247 # endif
 248 #endif
 249
 250 /* Entry points for GNU code.  */
 251
 252 /* re_match, re_search, re_match_2, re_search_2
 253
 254    The former two functions operate on STRING with length LENGTH,
 255    while the later two operate on concatenation of STRING1 and STRING2
 256    with lengths LENGTH1 and LENGTH2, respectively.
 257
 258    re_match() matches the compiled pattern in BUFP against the string,
 259    starting at index START.
 260
 261    re_search() first tries matching at index START, then it tries to match
 262    starting from index START + 1, and so on.  The last start position tried
 263    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 264    way as re_match().)
 265
 266    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 267    the first STOP characters of the concatenation of the strings should be
 268    concerned.
 269
 270    If REGS is not NULL, and BUFP->re_no_sub is not set, the offsets of the match
 271    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 272    computed relative to the concatenation, not relative to the individual
 273    strings.)
 274
 275    On success, re_match* functions return the length of the match, re_search*
 276    return the position of the start of the match.  Return value -1 means no
 277    match was found and -2 indicates an internal error.  */
 278
 279 int
 280 re_match (struct re_pattern_buffer *bufp, const char *string,
 281           int length, int start, struct re_registers *regs)
 282 {
 283   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 284 }
 285 #ifdef _LIBC
 286 weak_alias (__re_match, re_match)
 287 #endif
 288
 289 int
 290 re_search (struct re_pattern_buffer *bufp, const char *string,
 291            int length, int start, int range, struct re_registers *regs)
 292 {
 293   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 294 }
 295 #ifdef _LIBC
 296 weak_alias (__re_search, re_search)
 297 #endif
 298
 299 int
 300 re_match_2 (struct re_pattern_buffer *bufp,
 301             const char *string1, int length1,
 302             const char *string2, int length2,
 303             int start, struct re_registers *regs, int stop)
 304 {
 305   return re_search_2_stub (bufp, string1, length1, string2, length2,
 306                            start, 0, regs, stop, 1);
 307 }
 308 #ifdef _LIBC
 309 weak_alias (__re_match_2, re_match_2)
 310 #endif
 311
 312 int
 313 re_search_2 (struct re_pattern_buffer *bufp,
 314              const char *string1, int length1,
 315              const char *string2, int length2,
 316              int start, int range, struct re_registers *regs, int stop)
 317 {
 318   return re_search_2_stub (bufp, string1, length1, string2, length2,
 319                            start, range, regs, stop, 0);
 320 }
 321 #ifdef _LIBC
 322 weak_alias (__re_search_2, re_search_2)
 323 #endif
 324
 325 static int
 326 internal_function
 327 re_search_2_stub (struct re_pattern_buffer *bufp,
 328                   const char *string1, int length1,
 329                   const char *string2, int length2,
 330                   int start, int range, struct re_registers *regs, int stop,
 331                   int ret_len)
 332 {
 333   const char *str;
 334   int rval;
 335   int len = length1 + length2;
 336   int free_str = 0;
 337
 338   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 339     return -2;
 340
 341   /* Concatenate the strings.  */
 342   if (length2 > 0)
 343     if (length1 > 0)
 344       {
 345         char *s = re_malloc (char, len);
 346
 347         if (BE (s == NULL, 0))
 348           return -2;
 349         memcpy (s, string1, length1);
 350         memcpy (s + length1, string2, length2);
 351         str = s;
 352         free_str = 1;
 353       }
 354     else
 355       str = string2;
 356   else
 357     str = string1;
 358
 359   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 360                          ret_len);
 361   if (free_str)
 362     re_free ((char *) str);
 363   return rval;
 364 }
 365
 366 /* The parameters have the same meaning as those of re_search.
 367    Additional parameters:
 368    If RET_LEN is nonzero the length of the match is returned (re_match style);
 369    otherwise the position of the match is returned.  */
 370
 371 static int
 372 internal_function
 373 re_search_stub (struct re_pattern_buffer *bufp,
 374                 const char *string, int length,
 375                 int start, int range, int stop, struct re_registers *regs,
 376                 int ret_len)
 377 {
 378   reg_errcode_t result;
 379   regmatch_t *pmatch;
 380   int nregs, rval;
 381   int eflags = 0;
 382 #ifdef _LIBC
 383   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 384 #endif
 385
 386   /* Check for out-of-range.  */
 387   if (BE (start < 0 || start > length, 0))
 388     return -1;
 389   if (BE (start + range > length, 0))
 390     range = length - start;
 391   else if (BE (start + range < 0, 0))
 392     range = -start;
 393
 394   __libc_lock_lock (dfa->lock);
 395
 396   eflags |= (bufp->re_not_bol) ? REG_NOTBOL : 0;
 397   eflags |= (bufp->re_not_eol) ? REG_NOTEOL : 0;
 398
 399   /* Compile fastmap if we haven't yet.  */
 400   if (range > 0 && bufp->re_fastmap != NULL && !bufp->re_fastmap_accurate)
 401     re_compile_fastmap (bufp);
 402
 403   if (BE (bufp->re_no_sub, 0))
 404     regs = NULL;
 405
 406   /* We need at least 1 register.  */
 407   if (regs == NULL)
 408     nregs = 1;
 409   else if (BE (bufp->re_regs_allocated == REG_FIXED
 410                && regs->rm_num_regs < bufp->re_nsub + 1, 0))
 411     {
 412       nregs = regs->rm_num_regs;
 413       if (BE (nregs < 1, 0))
 414         {
 415           /* Nothing can be copied to regs.  */
 416           regs = NULL;
 417           nregs = 1;
 418         }
 419     }
 420   else
 421     nregs = bufp->re_nsub + 1;
 422   pmatch = re_malloc (regmatch_t, nregs);
 423   if (BE (pmatch == NULL, 0))
 424     {
 425       rval = -2;
 426       goto out;
 427     }
 428
 429   result = re_search_internal (bufp, string, length, start, range, stop,
 430                                nregs, pmatch, eflags);
 431
 432   rval = 0;
 433
 434   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 435   if (result != REG_NOERROR)
 436     rval = -1;
 437   else if (regs != NULL)
 438     {
 439       /* If caller wants register contents data back, copy them.  */
 440       bufp->re_regs_allocated = re_copy_regs (regs, pmatch, nregs,
 441                                               bufp->re_regs_allocated);
 442       if (BE (bufp->re_regs_allocated == REG_UNALLOCATED, 0))
 443         rval = -2;
 444     }
 445
 446   if (BE (rval == 0, 1))
 447     {
 448       if (ret_len)
 449         {
 450           assert (pmatch[0].rm_so == start);
 451           rval = pmatch[0].rm_eo - start;
 452         }
 453       else
 454         rval = pmatch[0].rm_so;
 455     }
 456   re_free (pmatch);
 457  out:
 458   __libc_lock_unlock (dfa->lock);
 459   return rval;
 460 }
 461
 462 static unsigned
 463 internal_function
 464 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs,
 465               int regs_allocated)
 466 {
 467   int rval = REG_REALLOCATE;
 468   int i;
 469   int need_regs = nregs + 1;
 470   /* We need one extra element beyond `rm_num_regs' for the `-1' marker GNU code
 471      uses.  */
 472
 473   /* Have the register data arrays been allocated?  */
 474   if (regs_allocated == REG_UNALLOCATED)
 475     { /* No.  So allocate them with malloc.  */
 476       regs->rm_start = re_malloc (regoff_t, need_regs);
 477       regs->rm_end = re_malloc (regoff_t, need_regs);
 478       if (BE (regs->rm_start == NULL, 0) || BE (regs->rm_end == NULL, 0))
 479         return REG_UNALLOCATED;
 480       regs->rm_num_regs = need_regs;
 481     }
 482   else if (regs_allocated == REG_REALLOCATE)
 483     { /* Yes.  If we need more elements than were already
 484          allocated, reallocate them.  If we need fewer, just
 485          leave it alone.  */
 486       if (BE (need_regs > regs->rm_num_regs, 0))
 487         {
 488           regoff_t *new_start =
 489             re_realloc (regs->rm_start, regoff_t, need_regs);
 490           regoff_t *new_end = re_realloc (regs->rm_end, regoff_t, need_regs);
 491           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 492             return REG_UNALLOCATED;
 493           regs->rm_start = new_start;
 494           regs->rm_end = new_end;
 495           regs->rm_num_regs = need_regs;
 496         }
 497     }
 498   else
 499     {
 500       assert (regs_allocated == REG_FIXED);
 501       /* This function may not be called with REG_FIXED and nregs too big.  */
 502       assert (regs->rm_num_regs >= nregs);
 503       rval = REG_FIXED;
 504     }
 505
 506   /* Copy the regs.  */
 507   for (i = 0; i < nregs; ++i)
 508     {
 509       regs->rm_start[i] = pmatch[i].rm_so;
 510       regs->rm_end[i] = pmatch[i].rm_eo;
 511     }
 512   for ( ; i < regs->rm_num_regs; ++i)
 513     regs->rm_start[i] = regs->rm_end[i] = -1;
 514
 515   return rval;
 516 }
 517
 518 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 519    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 520    this memory for recording register information.  STARTS and ENDS
 521    must be allocated using the malloc library routine, and must each
 522    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 523
 524    If NUM_REGS == 0, then subsequent matches should allocate their own
 525    register data.
 526
 527    Unless this function is called, the first search or match using
 528    PATTERN_BUFFER will allocate its own register data, without
 529    freeing the old data.  */
 530
 531 void
 532 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
 533                   unsigned int num_regs, regoff_t *starts, regoff_t *ends)
 534 {
 535   if (num_regs)
 536     {
 537       bufp->re_regs_allocated = REG_REALLOCATE;
 538       regs->rm_num_regs = num_regs;
 539       regs->rm_start = starts;
 540       regs->rm_end = ends;
 541     }
 542   else
 543     {
 544       bufp->re_regs_allocated = REG_UNALLOCATED;
 545       regs->rm_num_regs = 0;
 546       regs->rm_start = regs->rm_end = NULL;
 547     }
 548 }
 549 #ifdef _LIBC
 550 weak_alias (__re_set_registers, re_set_registers)
 551 #endif
 552 \f
 553 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 554    them unless specifically requested.  */
 555
 556 #if defined _REGEX_RE_COMP || defined _LIBC
 557 int
 558 # ifdef _LIBC
 559 weak_function
 560 # endif
 561 re_exec (s)
 562      const char *s;
 563 {
 564   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 565 }
 566 #endif /* _REGEX_RE_COMP */
 567 \f
 568 /* Internal entry point.  */
 569
 570 /* Searches for a compiled pattern PREG in the string STRING, whose
 571    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 572    mingings with regexec.  START, and RANGE have the same meanings
 573    with re_search.
 574    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 575    otherwise return the error code.
 576    Note: We assume front end functions already check ranges.
 577    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 578
 579 static reg_errcode_t
 580 internal_function
 581 re_search_internal (const regex_t *preg,
 582                     const char *string, int length,
 583                     int start, int range, int stop,
 584                     size_t nmatch, regmatch_t pmatch[],
 585                     int eflags)
 586 {
 587   reg_errcode_t err;
 588   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 589   int left_lim, right_lim, incr;
 590   int fl_longest_match, match_first, match_kind, match_last = -1;
 591   int extra_nmatch;
 592   int sb, ch;
 593 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 594   re_match_context_t mctx = { .dfa = dfa };
 595 #else
 596   re_match_context_t mctx;
 597 #endif
 598   char *fastmap = (preg->re_fastmap != NULL && preg->re_fastmap_accurate
 599                    && range && !preg->re_can_be_null) ? preg->re_fastmap : NULL;
 600   unsigned REG_TRANSLATE_TYPE t =
 601     (unsigned REG_TRANSLATE_TYPE) preg->re_translate;
 602
 603 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 604   memset (&mctx, '\0', sizeof (re_match_context_t));
 605   mctx.dfa = dfa;
 606 #endif
 607
 608   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 609   nmatch -= extra_nmatch;
 610
 611   /* Check if the DFA haven't been compiled.  */
 612   if (BE (preg->re_used == 0 || dfa->init_state == NULL
 613           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 614           || dfa->init_state_begbuf == NULL, 0))
 615     return REG_NOMATCH;
 616
 617 #ifdef DEBUG
 618   /* We assume front-end functions already check them.  */
 619   assert (start + range >= 0 && start + range <= length);
 620 #endif
 621
 622   /* If initial states with non-begbuf contexts have no elements,
 623      the regex must be anchored.  If preg->re_newline_anchor is set,
 624      we'll never use init_state_nl, so do not check it.  */
 625   if (dfa->init_state->nodes.nelem == 0
 626       && dfa->init_state_word->nodes.nelem == 0
 627       && (dfa->init_state_nl->nodes.nelem == 0
 628           || !preg->re_newline_anchor))
 629     {
 630       if (start != 0 && start + range != 0)
 631         return REG_NOMATCH;
 632       start = range = 0;
 633     }
 634
 635   /* We must check the longest matching, if nmatch > 0.  */
 636   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 637
 638   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 639                             preg->re_translate,
 640                             preg->re_syntax & REG_IGNORE_CASE, dfa);
 641   if (BE (err != REG_NOERROR, 0))
 642     goto free_return;
 643   mctx.input.stop = stop;
 644   mctx.input.raw_stop = stop;
 645   mctx.input.newline_anchor = preg->re_newline_anchor;
 646
 647   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 648   if (BE (err != REG_NOERROR, 0))
 649     goto free_return;
 650
 651   /* We will log all the DFA states through which the dfa pass,
 652      if nmatch > 1, or this dfa has "multibyte node", which is a
 653      back-reference or a node which can accept multibyte character or
 654      multi character collating element.  */
 655   if (nmatch > 1 || dfa->has_mb_node)
 656     {
 657       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 658       if (BE (mctx.state_log == NULL, 0))
 659         {
 660           err = REG_ESPACE;
 661           goto free_return;
 662         }
 663     }
 664   else
 665     mctx.state_log = NULL;
 666
 667   match_first = start;
 668   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 669                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 670
 671   /* Check incrementally whether of not the input string match.  */
 672   incr = (range < 0) ? -1 : 1;
 673   left_lim = (range < 0) ? start + range : start;
 674   right_lim = (range < 0) ? start : start + range;
 675   sb = dfa->mb_cur_max == 1;
 676   match_kind =
 677     (fastmap
 678      ? ((sb || !(preg->re_syntax & REG_IGNORE_CASE || t) ? 4 : 0)
 679         | (range >= 0 ? 2 : 0)
 680         | (t != NULL ? 1 : 0))
 681      : 8);
 682
 683   for (;; match_first += incr)
 684     {
 685       err = REG_NOMATCH;
 686       if (match_first < left_lim || right_lim < match_first)
 687         goto free_return;
 688
 689       /* Advance as rapidly as possible through the string, until we
 690          find a plausible place to start matching.  This may be done
 691          with varying efficiency, so there are various possibilities:
 692          only the most common of them are specialized, in order to
 693          save on code size.  We use a switch statement for speed.  */
 694       switch (match_kind)
 695         {
 696         case 8:
 697           /* No fastmap.  */
 698           break;
 699
 700         case 7:
 701           /* Fastmap with single-byte translation, match forward.  */
 702           while (BE (match_first < right_lim, 1)
 703                  && !fastmap[t[(unsigned char) string[match_first]]])
 704             ++match_first;
 705           goto forward_match_found_start_or_reached_end;
 706
 707         case 6:
 708           /* Fastmap without translation, match forward.  */
 709           while (BE (match_first < right_lim, 1)
 710                  && !fastmap[(unsigned char) string[match_first]])
 711             ++match_first;
 712
 713         forward_match_found_start_or_reached_end:
 714           if (BE (match_first == right_lim, 0))
 715             {
 716               ch = match_first >= length
 717                        ? 0 : (unsigned char) string[match_first];
 718               if (!fastmap[t ? t[ch] : ch])
 719                 goto free_return;
 720             }
 721           break;
 722
 723         case 4:
 724         case 5:
 725           /* Fastmap without multi-byte translation, match backwards.  */
 726           while (match_first >= left_lim)
 727             {
 728               ch = match_first >= length
 729                        ? 0 : (unsigned char) string[match_first];
 730               if (fastmap[t ? t[ch] : ch])
 731                 break;
 732               --match_first;
 733             }
 734           if (match_first < left_lim)
 735             goto free_return;
 736           break;
 737
 738         default:
 739           /* In this case, we can't determine easily the current byte,
 740              since it might be a component byte of a multibyte
 741              character.  Then we use the constructed buffer instead.  */
 742           for (;;)
 743             {
 744               /* If MATCH_FIRST is out of the valid range, reconstruct the
 745                  buffers.  */
 746               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 747               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 748                 {
 749                   err = re_string_reconstruct (&mctx.input, match_first,
 750                                                eflags);
 751                   if (BE (err != REG_NOERROR, 0))
 752                     goto free_return;
 753
 754                   offset = match_first - mctx.input.raw_mbs_idx;
 755                 }
 756               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 757                  Note that MATCH_FIRST must not be smaller than 0.  */
 758               ch = (match_first >= length
 759                     ? 0 : re_string_byte_at (&mctx.input, offset));
 760               if (fastmap[ch])
 761                 break;
 762               match_first += incr;
 763               if (match_first < left_lim || match_first > right_lim)
 764                 {
 765                   err = REG_NOMATCH;
 766                   goto free_return;
 767                 }
 768             }
 769           break;
 770         }
 771
 772       /* Reconstruct the buffers so that the matcher can assume that
 773          the matching starts from the beginning of the buffer.  */
 774       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 775       if (BE (err != REG_NOERROR, 0))
 776         goto free_return;
 777
 778 #ifdef RE_ENABLE_I18N
 779      /* Don't consider this char as a possible match start if it part,
 780         yet isn't the head, of a multibyte character.  */
 781       if (!sb && !re_string_first_byte (&mctx.input, 0))
 782         continue;
 783 #endif
 784
 785       /* It seems to be appropriate one, then use the matcher.  */
 786       /* We assume that the matching starts from 0.  */
 787       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 788       match_last = check_matching (&mctx, fl_longest_match,
 789                                    range >= 0 ? &match_first : NULL);
 790       if (match_last != -1)
 791         {
 792           if (BE (match_last == -2, 0))
 793             {
 794               err = REG_ESPACE;
 795               goto free_return;
 796             }
 797           else
 798             {
 799               mctx.match_last = match_last;
 800               if ((!preg->re_no_sub && nmatch > 1) || dfa->nbackref)
 801                 {
 802                   re_dfastate_t *pstate = mctx.state_log[match_last];
 803                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 804                                                              match_last);
 805                 }
 806               if ((!preg->re_no_sub && nmatch > 1 && dfa->has_plural_match)
 807                   || dfa->nbackref)
 808                 {
 809                   err = prune_impossible_nodes (&mctx);
 810                   if (err == REG_NOERROR)
 811                     break;
 812                   if (BE (err != REG_NOMATCH, 0))
 813                     goto free_return;
 814                   match_last = -1;
 815                 }
 816               else
 817                 break; /* We found a match.  */
 818             }
 819         }
 820
 821       match_ctx_clean (&mctx);
 822     }
 823
 824 #ifdef DEBUG
 825   assert (match_last != -1);
 826   assert (err == REG_NOERROR);
 827 #endif
 828
 829   /* Set pmatch[] if we need.  */
 830   if (nmatch > 0)
 831     {
 832       int reg_idx;
 833
 834       /* Initialize registers.  */
 835       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 836         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 837
 838       /* Set the points where matching start/end.  */
 839       pmatch[0].rm_so = 0;
 840       pmatch[0].rm_eo = mctx.match_last;
 841
 842       if (!preg->re_no_sub && nmatch > 1)
 843         {
 844           err = set_regs (preg, &mctx, nmatch, pmatch,
 845                           dfa->has_plural_match && dfa->nbackref > 0);
 846           if (BE (err != REG_NOERROR, 0))
 847             goto free_return;
 848         }
 849
 850       /* At last, add the offset to the each registers, since we slided
 851          the buffers so that we could assume that the matching starts
 852          from 0.  */
 853       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 854         if (pmatch[reg_idx].rm_so != -1)
 855           {
 856 #ifdef RE_ENABLE_I18N
 857             if (BE (mctx.input.offsets_needed != 0, 0))
 858               {
 859                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 860                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 861                 else
 862                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 863                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 864                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 865                 else
 866                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 867               }
 868 #else
 869             assert (mctx.input.offsets_needed == 0);
 870 #endif
 871             pmatch[reg_idx].rm_so += match_first;
 872             pmatch[reg_idx].rm_eo += match_first;
 873           }
 874       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 875         {
 876           pmatch[nmatch + reg_idx].rm_so = -1;
 877           pmatch[nmatch + reg_idx].rm_eo = -1;
 878         }
 879
 880       if (dfa->subexp_map)
 881         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 882           if (dfa->subexp_map[reg_idx] != reg_idx)
 883             {
 884               pmatch[reg_idx + 1].rm_so
 885                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 886               pmatch[reg_idx + 1].rm_eo
 887                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 888             }
 889     }
 890
 891  free_return:
 892   re_free (mctx.state_log);
 893   if (dfa->nbackref)
 894     match_ctx_free (&mctx);
 895   re_string_destruct (&mctx.input);
 896   return err;
 897 }
 898
 899 static reg_errcode_t
 900 internal_function
 901 prune_impossible_nodes (re_match_context_t *mctx)
 902 {
 903   re_dfa_t *const dfa = mctx->dfa;
 904   int halt_node, match_last;
 905   reg_errcode_t ret;
 906   re_dfastate_t **sifted_states;
 907   re_dfastate_t **lim_states = NULL;
 908   re_sift_context_t sctx;
 909 #ifdef DEBUG
 910   assert (mctx->state_log != NULL);
 911 #endif
 912   match_last = mctx->match_last;
 913   halt_node = mctx->last_node;
 914   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 915   if (BE (sifted_states == NULL, 0))
 916     {
 917       ret = REG_ESPACE;
 918       goto free_return;
 919     }
 920   if (dfa->nbackref)
 921     {
 922       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 923       if (BE (lim_states == NULL, 0))
 924         {
 925           ret = REG_ESPACE;
 926           goto free_return;
 927         }
 928       while (1)
 929         {
 930           memset (lim_states, '\0',
 931                   sizeof (re_dfastate_t *) * (match_last + 1));
 932           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 933                          match_last);
 934           ret = sift_states_backward (mctx, &sctx);
 935           re_node_set_free (&sctx.limits);
 936           if (BE (ret != REG_NOERROR, 0))
 937               goto free_return;
 938           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 939             break;
 940           do
 941             {
 942               --match_last;
 943               if (match_last < 0)
 944                 {
 945                   ret = REG_NOMATCH;
 946                   goto free_return;
 947                 }
 948             } while (mctx->state_log[match_last] == NULL
 949                      || !mctx->state_log[match_last]->halt);
 950           halt_node = check_halt_state_context (mctx,
 951                                                 mctx->state_log[match_last],
 952                                                 match_last);
 953         }
 954       ret = merge_state_array (dfa, sifted_states, lim_states,
 955                                match_last + 1);
 956       re_free (lim_states);
 957       lim_states = NULL;
 958       if (BE (ret != REG_NOERROR, 0))
 959         goto free_return;
 960     }
 961   else
 962     {
 963       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 964       ret = sift_states_backward (mctx, &sctx);
 965       re_node_set_free (&sctx.limits);
 966       if (BE (ret != REG_NOERROR, 0))
 967         goto free_return;
 968     }
 969   re_free (mctx->state_log);
 970   mctx->state_log = sifted_states;
 971   sifted_states = NULL;
 972   mctx->last_node = halt_node;
 973   mctx->match_last = match_last;
 974   ret = REG_NOERROR;
 975  free_return:
 976   re_free (sifted_states);
 977   re_free (lim_states);
 978   return ret;
 979 }
 980
 981 /* Acquire an initial state and return it.
 982    We must select appropriate initial state depending on the context,
 983    since initial states may have constraints like "\<", "^", etc..  */
 984
 985 static inline re_dfastate_t *
 986 __attribute ((always_inline)) internal_function
 987 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
 988                             int idx)
 989 {
 990   re_dfa_t *const dfa = mctx->dfa;
 991   if (dfa->init_state->has_constraint)
 992     {
 993       unsigned int context;
 994       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
 995       if (IS_WORD_CONTEXT (context))
 996         return dfa->init_state_word;
 997       else if (IS_ORDINARY_CONTEXT (context))
 998         return dfa->init_state;
 999       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1000         return dfa->init_state_begbuf;
1001       else if (IS_NEWLINE_CONTEXT (context))
1002         return dfa->init_state_nl;
1003       else if (IS_BEGBUF_CONTEXT (context))
1004         {
1005           /* It is relatively rare case, then calculate on demand.  */
1006           return re_acquire_state_context (err, dfa,
1007                                            dfa->init_state->entrance_nodes,
1008                                            context);
1009         }
1010       else
1011         /* Must not happen?  */
1012         return dfa->init_state;
1013     }
1014   else
1015     return dfa->init_state;
1016 }
1017
1018 /* Check whether the regular expression match input string INPUT or not,
1019    and return the index where the matching end, return -1 if not match,
1020    or return -2 in case of an error.
1021    FL_LONGEST_MATCH means we want the POSIX longest matching.
1022    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1023    next place where we may want to try matching.
1024    Note that the matcher assume that the maching starts from the current
1025    index of the buffer.  */
1026
1027 static int
1028 internal_function
1029 check_matching (re_match_context_t *mctx, int fl_longest_match,
1030                 int *p_match_first)
1031 {
1032   re_dfa_t *const dfa = mctx->dfa;
1033   reg_errcode_t err;
1034   int match = 0;
1035   int match_last = -1;
1036   int cur_str_idx = re_string_cur_idx (&mctx->input);
1037   re_dfastate_t *cur_state;
1038   int at_init_state = p_match_first != NULL;
1039   int next_start_idx = cur_str_idx;
1040
1041   err = REG_NOERROR;
1042   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1043   /* An initial state must not be NULL (invalid).  */
1044   if (BE (cur_state == NULL, 0))
1045     {
1046       assert (err == REG_ESPACE);
1047       return -2;
1048     }
1049
1050   if (mctx->state_log != NULL)
1051     {
1052       mctx->state_log[cur_str_idx] = cur_state;
1053
1054       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1055          later.  E.g. Processing back references.  */
1056       if (BE (dfa->nbackref, 0))
1057         {
1058           at_init_state = 0;
1059           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1060           if (BE (err != REG_NOERROR, 0))
1061             return err;
1062
1063           if (cur_state->has_backref)
1064             {
1065               err = transit_state_bkref (mctx, &cur_state->nodes);
1066               if (BE (err != REG_NOERROR, 0))
1067                 return err;
1068             }
1069         }
1070     }
1071
1072   /* If the RE accepts NULL string.  */
1073   if (BE (cur_state->halt, 0))
1074     {
1075       if (!cur_state->has_constraint
1076           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1077         {
1078           if (!fl_longest_match)
1079             return cur_str_idx;
1080           else
1081             {
1082               match_last = cur_str_idx;
1083               match = 1;
1084             }
1085         }
1086     }
1087
1088   while (!re_string_eoi (&mctx->input))
1089     {
1090       re_dfastate_t *old_state = cur_state;
1091       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1092
1093       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1094           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1095               && mctx->input.valid_len < mctx->input.len))
1096         {
1097           err = extend_buffers (mctx);
1098           if (BE (err != REG_NOERROR, 0))
1099             {
1100               assert (err == REG_ESPACE);
1101               return -2;
1102             }
1103         }
1104
1105       cur_state = transit_state (&err, mctx, cur_state);
1106       if (mctx->state_log != NULL)
1107         cur_state = merge_state_with_log (&err, mctx, cur_state);
1108
1109       if (cur_state == NULL)
1110         {
1111           /* Reached the invalid state or an error.  Try to recover a valid
1112              state using the state log, if available and if we have not
1113              already found a valid (even if not the longest) match.  */
1114           if (BE (err != REG_NOERROR, 0))
1115             return -2;
1116
1117           if (mctx->state_log == NULL
1118               || (match && !fl_longest_match)
1119               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1120             break;
1121         }
1122
1123       if (BE (at_init_state, 0))
1124         {
1125           if (old_state == cur_state)
1126             next_start_idx = next_char_idx;
1127           else
1128             at_init_state = 0;
1129         }
1130
1131       if (cur_state->halt)
1132         {
1133           /* Reached a halt state.
1134              Check the halt state can satisfy the current context.  */
1135           if (!cur_state->has_constraint
1136               || check_halt_state_context (mctx, cur_state,
1137                                            re_string_cur_idx (&mctx->input)))
1138             {
1139               /* We found an appropriate halt state.  */
1140               match_last = re_string_cur_idx (&mctx->input);
1141               match = 1;
1142
1143               /* We found a match, do not modify match_first below.  */
1144               p_match_first = NULL;
1145               if (!fl_longest_match)
1146                 break;
1147             }
1148         }
1149     }
1150
1151   if (p_match_first)
1152     *p_match_first += next_start_idx;
1153
1154   return match_last;
1155 }
1156
1157 /* Check NODE match the current context.  */
1158
1159 static int
1160 internal_function
1161 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1162 {
1163   re_token_type_t type = dfa->nodes[node].type;
1164   unsigned int constraint = dfa->nodes[node].constraint;
1165   if (type != END_OF_RE)
1166     return 0;
1167   if (!constraint)
1168     return 1;
1169   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1170     return 0;
1171   return 1;
1172 }
1173
1174 /* Check the halt state STATE match the current context.
1175    Return 0 if not match, if the node, STATE has, is a halt node and
1176    match the context, return the node.  */
1177
1178 static int
1179 internal_function
1180 check_halt_state_context (const re_match_context_t *mctx,
1181                           const re_dfastate_t *state, int idx)
1182 {
1183   int i;
1184   unsigned int context;
1185 #ifdef DEBUG
1186   assert (state->halt);
1187 #endif
1188   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1189   for (i = 0; i < state->nodes.nelem; ++i)
1190     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1191       return state->nodes.elems[i];
1192   return 0;
1193 }
1194
1195 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1196    corresponding to the DFA).
1197    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1198    of errors.  */
1199
1200 static int
1201 internal_function
1202 proceed_next_node (const re_match_context_t *mctx,
1203                    int nregs, regmatch_t *regs, int *pidx, int node,
1204                    re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1205 {
1206   re_dfa_t *const dfa = mctx->dfa;
1207   int i, err;
1208   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1209     {
1210       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1211       re_node_set *edests = &dfa->edests[node];
1212       int dest_node;
1213       err = re_node_set_insert (eps_via_nodes, node);
1214       if (BE (err < 0, 0))
1215         return -2;
1216       /* Pick up a valid destination, or return -1 if none is found.  */
1217       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1218         {
1219           int candidate = edests->elems[i];
1220           if (!re_node_set_contains (cur_nodes, candidate))
1221             continue;
1222           if (dest_node == -1)
1223             dest_node = candidate;
1224
1225           else
1226             {
1227               /* In order to avoid infinite loop like "(a*)*", return the second
1228                  epsilon-transition if the first was already considered.  */
1229               if (re_node_set_contains (eps_via_nodes, dest_node))
1230                 return candidate;
1231
1232               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1233               else if (fs != NULL
1234                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1235                                            eps_via_nodes))
1236                 return -2;
1237
1238               /* We know we are going to exit.  */
1239               break;
1240             }
1241         }
1242       return dest_node;
1243     }
1244   else
1245     {
1246       int naccepted = 0;
1247       re_token_type_t type = dfa->nodes[node].type;
1248
1249 #ifdef RE_ENABLE_I18N
1250       if (dfa->nodes[node].accept_mb)
1251         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1252       else
1253 #endif /* RE_ENABLE_I18N */
1254       if (type == OP_BACK_REF)
1255         {
1256           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1257           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1258           if (fs != NULL)
1259             {
1260               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1261                 return -1;
1262               else if (naccepted)
1263                 {
1264                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1265                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1266                               naccepted) != 0)
1267                     return -1;
1268                 }
1269             }
1270
1271           if (naccepted == 0)
1272             {
1273               int dest_node;
1274               err = re_node_set_insert (eps_via_nodes, node);
1275               if (BE (err < 0, 0))
1276                 return -2;
1277               dest_node = dfa->edests[node].elems[0];
1278               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1279                                         dest_node))
1280                 return dest_node;
1281             }
1282         }
1283
1284       if (naccepted != 0
1285           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1286         {
1287           int dest_node = dfa->nexts[node];
1288           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1289           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1290                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1291                                                dest_node)))
1292             return -1;
1293           re_node_set_empty (eps_via_nodes);
1294           return dest_node;
1295         }
1296     }
1297   return -1;
1298 }
1299
1300 static reg_errcode_t
1301 internal_function
1302 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1303                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1304 {
1305   reg_errcode_t err;
1306   int num = fs->num++;
1307   if (fs->num == fs->alloc)
1308     {
1309       struct re_fail_stack_ent_t *new_array;
1310       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1311                                        * fs->alloc * 2));
1312       if (new_array == NULL)
1313         return REG_ESPACE;
1314       fs->alloc *= 2;
1315       fs->stack = new_array;
1316     }
1317   fs->stack[num].idx = str_idx;
1318   fs->stack[num].node = dest_node;
1319   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1320   if (fs->stack[num].regs == NULL)
1321     return REG_ESPACE;
1322   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1323   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1324   return err;
1325 }
1326
1327 static int
1328 internal_function
1329 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx,
1330                 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1331 {
1332   int num = --fs->num;
1333   assert (num >= 0);
1334   *pidx = fs->stack[num].idx;
1335   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1336   re_node_set_free (eps_via_nodes);
1337   re_free (fs->stack[num].regs);
1338   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1339   return fs->stack[num].node;
1340 }
1341
1342 /* Set the positions where the subexpressions are starts/ends to registers
1343    PMATCH.
1344    Note: We assume that pmatch[0] is already set, and
1345    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1346
1347 static reg_errcode_t
1348 internal_function
1349 set_regs (const regex_t *preg, const re_match_context_t *mctx,
1350           size_t nmatch, regmatch_t *pmatch, int fl_backtrack)
1351 {
1352   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1353   int idx, cur_node;
1354   re_node_set eps_via_nodes;
1355   struct re_fail_stack_t *fs;
1356   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1357   regmatch_t *prev_idx_match;
1358
1359 #ifdef DEBUG
1360   assert (nmatch > 1);
1361   assert (mctx->state_log != NULL);
1362 #endif
1363   if (fl_backtrack)
1364     {
1365       fs = &fs_body;
1366       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1367       if (fs->stack == NULL)
1368         return REG_ESPACE;
1369     }
1370   else
1371     fs = NULL;
1372
1373   cur_node = dfa->init_node;
1374   re_node_set_init_empty (&eps_via_nodes);
1375
1376   prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * nmatch);
1377   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1378
1379   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1380     {
1381       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1382
1383       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1384         {
1385           int reg_idx;
1386           if (fs)
1387             {
1388               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1389                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1390                   break;
1391               if (reg_idx == nmatch)
1392                 {
1393                   re_node_set_free (&eps_via_nodes);
1394                   return free_fail_stack_return (fs);
1395                 }
1396               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1397                                          &eps_via_nodes);
1398             }
1399           else
1400             {
1401               re_node_set_free (&eps_via_nodes);
1402               return REG_NOERROR;
1403             }
1404         }
1405
1406       /* Proceed to next node.  */
1407       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1408                                     &eps_via_nodes, fs);
1409
1410       if (BE (cur_node < 0, 0))
1411         {
1412           if (BE (cur_node == -2, 0))
1413             {
1414               re_node_set_free (&eps_via_nodes);
1415               free_fail_stack_return (fs);
1416               return REG_ESPACE;
1417             }
1418           if (fs)
1419             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1420                                        &eps_via_nodes);
1421           else
1422             {
1423               re_node_set_free (&eps_via_nodes);
1424               return REG_NOMATCH;
1425             }
1426         }
1427     }
1428   re_node_set_free (&eps_via_nodes);
1429   return free_fail_stack_return (fs);
1430 }
1431
1432 static reg_errcode_t
1433 internal_function
1434 free_fail_stack_return (struct re_fail_stack_t *fs)
1435 {
1436   if (fs)
1437     {
1438       int fs_idx;
1439       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1440         {
1441           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1442           re_free (fs->stack[fs_idx].regs);
1443         }
1444       re_free (fs->stack);
1445     }
1446   return REG_NOERROR;
1447 }
1448
1449 static void
1450 internal_function
1451 update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1452              int cur_node, int cur_idx, int nmatch)
1453 {
1454   int type = dfa->nodes[cur_node].type;
1455   if (type == OP_OPEN_SUBEXP)
1456     {
1457       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1458
1459       /* We are at the first node of this sub expression.  */
1460       if (reg_num < nmatch)
1461         {
1462           pmatch[reg_num].rm_so = cur_idx;
1463           pmatch[reg_num].rm_eo = -1;
1464         }
1465     }
1466   else if (type == OP_CLOSE_SUBEXP)
1467     {
1468       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1469       if (reg_num < nmatch)
1470         {
1471           /* We are at the last node of this sub expression.  */
1472           if (pmatch[reg_num].rm_so < cur_idx)
1473             {
1474               pmatch[reg_num].rm_eo = cur_idx;
1475               /* This is a non-empty match or we are not inside an optional
1476                  subexpression.  Accept this right away.  */
1477               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1478             }
1479           else
1480             {
1481               if (dfa->nodes[cur_node].opt_subexp
1482                   && prev_idx_match[reg_num].rm_so != -1)
1483                 /* We transited through an empty match for an optional
1484                    subexpression, like (a?)*, and this is not the subexp's
1485                    first match.  Copy back the old content of the registers
1486                    so that matches of an inner subexpression are undone as
1487                    well, like in ((a?))*.  */
1488                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1489               else
1490                 /* We completed a subexpression, but it may be part of
1491                    an optional one, so do not update PREV_IDX_MATCH.  */
1492                 pmatch[reg_num].rm_eo = cur_idx;
1493             }
1494         }
1495     }
1496 }
1497
1498 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1499    and sift the nodes in each states according to the following rules.
1500    Updated state_log will be wrote to STATE_LOG.
1501
1502    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1503      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1504         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1505         the LAST_NODE, we throw away the node `a'.
1506      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1507         string `s' and transit to `b':
1508         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1509            away the node `a'.
1510         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1511             thrown away, we throw away the node `a'.
1512      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1513         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1514            node `a'.
1515         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1516             we throw away the node `a'.  */
1517
1518 #define STATE_NODE_CONTAINS(state,node) \
1519   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1520
1521 static reg_errcode_t
1522 internal_function
1523 sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1524 {
1525   reg_errcode_t err;
1526   int null_cnt = 0;
1527   int str_idx = sctx->last_str_idx;
1528   re_node_set cur_dest;
1529
1530 #ifdef DEBUG
1531   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1532 #endif
1533
1534   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1535      transit to the last_node and the last_node itself.  */
1536   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1537   if (BE (err != REG_NOERROR, 0))
1538     return err;
1539   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1540   if (BE (err != REG_NOERROR, 0))
1541     goto free_return;
1542
1543   /* Then check each states in the state_log.  */
1544   while (str_idx > 0)
1545     {
1546       /* Update counters.  */
1547       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1548       if (null_cnt > mctx->max_mb_elem_len)
1549         {
1550           memset (sctx->sifted_states, '\0',
1551                   sizeof (re_dfastate_t *) * str_idx);
1552           re_node_set_free (&cur_dest);
1553           return REG_NOERROR;
1554         }
1555       re_node_set_empty (&cur_dest);
1556       --str_idx;
1557
1558       if (mctx->state_log[str_idx])
1559         {
1560           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1561           if (BE (err != REG_NOERROR, 0))
1562             goto free_return;
1563         }
1564
1565       /* Add all the nodes which satisfy the following conditions:
1566          - It can epsilon transit to a node in CUR_DEST.
1567          - It is in CUR_SRC.
1568          And update state_log.  */
1569       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1570       if (BE (err != REG_NOERROR, 0))
1571         goto free_return;
1572     }
1573   err = REG_NOERROR;
1574  free_return:
1575   re_node_set_free (&cur_dest);
1576   return err;
1577 }
1578
1579 static reg_errcode_t
1580 internal_function
1581 build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1582                      int str_idx, re_node_set *cur_dest)
1583 {
1584   re_dfa_t *const dfa = mctx->dfa;
1585   re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1586   int i;
1587
1588   /* Then build the next sifted state.
1589      We build the next sifted state on `cur_dest', and update
1590      `sifted_states[str_idx]' with `cur_dest'.
1591      Note:
1592      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1593      `cur_src' points the node_set of the old `state_log[str_idx]'
1594      (with the epsilon nodes pre-filtered out).  */
1595   for (i = 0; i < cur_src->nelem; i++)
1596     {
1597       int prev_node = cur_src->elems[i];
1598       int naccepted = 0;
1599       int ret;
1600
1601 #ifdef DEBUG
1602       re_token_type_t type = dfa->nodes[prev_node].type;
1603       assert (!IS_EPSILON_NODE (type));
1604 #endif
1605 #ifdef RE_ENABLE_I18N
1606       /* If the node may accept `multi byte'.  */
1607       if (dfa->nodes[prev_node].accept_mb)
1608         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1609                                          str_idx, sctx->last_str_idx);
1610 #endif /* RE_ENABLE_I18N */
1611
1612       /* We don't check backreferences here.
1613          See update_cur_sifted_state().  */
1614       if (!naccepted
1615           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1616           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1617                                   dfa->nexts[prev_node]))
1618         naccepted = 1;
1619
1620       if (naccepted == 0)
1621         continue;
1622
1623       if (sctx->limits.nelem)
1624         {
1625           int to_idx = str_idx + naccepted;
1626           if (check_dst_limits (mctx, &sctx->limits,
1627                                 dfa->nexts[prev_node], to_idx,
1628                                 prev_node, str_idx))
1629             continue;
1630         }
1631       ret = re_node_set_insert (cur_dest, prev_node);
1632       if (BE (ret == -1, 0))
1633         return REG_ESPACE;
1634     }
1635
1636   return REG_NOERROR;
1637 }
1638
1639 /* Helper functions.  */
1640
1641 static reg_errcode_t
1642 internal_function
1643 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1644 {
1645   int top = mctx->state_log_top;
1646
1647   if (next_state_log_idx >= mctx->input.bufs_len
1648       || (next_state_log_idx >= mctx->input.valid_len
1649           && mctx->input.valid_len < mctx->input.len))
1650     {
1651       reg_errcode_t err;
1652       err = extend_buffers (mctx);
1653       if (BE (err != REG_NOERROR, 0))
1654         return err;
1655     }
1656
1657   if (top < next_state_log_idx)
1658     {
1659       memset (mctx->state_log + top + 1, '\0',
1660               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1661       mctx->state_log_top = next_state_log_idx;
1662     }
1663   return REG_NOERROR;
1664 }
1665
1666 static reg_errcode_t
1667 internal_function
1668 merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1669                    int num)
1670 {
1671   int st_idx;
1672   reg_errcode_t err;
1673   for (st_idx = 0; st_idx < num; ++st_idx)
1674     {
1675       if (dst[st_idx] == NULL)
1676         dst[st_idx] = src[st_idx];
1677       else if (src[st_idx] != NULL)
1678         {
1679           re_node_set merged_set;
1680           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1681                                         &src[st_idx]->nodes);
1682           if (BE (err != REG_NOERROR, 0))
1683             return err;
1684           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1685           re_node_set_free (&merged_set);
1686           if (BE (err != REG_NOERROR, 0))
1687             return err;
1688         }
1689     }
1690   return REG_NOERROR;
1691 }
1692
1693 static reg_errcode_t
1694 internal_function
1695 update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1696                          int str_idx, re_node_set *dest_nodes)
1697 {
1698   re_dfa_t *const dfa = mctx->dfa;
1699   reg_errcode_t err;
1700   const re_node_set *candidates;
1701   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1702                 : &mctx->state_log[str_idx]->nodes);
1703
1704   if (dest_nodes->nelem == 0)
1705     sctx->sifted_states[str_idx] = NULL;
1706   else
1707     {
1708       if (candidates)
1709         {
1710           /* At first, add the nodes which can epsilon transit to a node in
1711              DEST_NODE.  */
1712           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1713           if (BE (err != REG_NOERROR, 0))
1714             return err;
1715
1716           /* Then, check the limitations in the current sift_context.  */
1717           if (sctx->limits.nelem)
1718             {
1719               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1720                                          mctx->bkref_ents, str_idx);
1721               if (BE (err != REG_NOERROR, 0))
1722                 return err;
1723             }
1724         }
1725
1726       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1727       if (BE (err != REG_NOERROR, 0))
1728         return err;
1729     }
1730
1731   if (candidates && mctx->state_log[str_idx]->has_backref)
1732     {
1733       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1734       if (BE (err != REG_NOERROR, 0))
1735         return err;
1736     }
1737   return REG_NOERROR;
1738 }
1739
1740 static reg_errcode_t
1741 internal_function
1742 add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1743                        const re_node_set *candidates)
1744 {
1745   reg_errcode_t err = REG_NOERROR;
1746   int i;
1747
1748   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1749   if (BE (err != REG_NOERROR, 0))
1750     return err;
1751
1752   if (!state->inveclosure.alloc)
1753     {
1754       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1755       if (BE (err != REG_NOERROR, 0))
1756         return REG_ESPACE;
1757       for (i = 0; i < dest_nodes->nelem; i++)
1758         re_node_set_merge (&state->inveclosure,
1759                            dfa->inveclosures + dest_nodes->elems[i]);
1760     }
1761   return re_node_set_add_intersect (dest_nodes, candidates,
1762                                     &state->inveclosure);
1763 }
1764
1765 static reg_errcode_t
1766 internal_function
1767 sub_epsilon_src_nodes (re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1768                        const re_node_set *candidates)
1769 {
1770     int ecl_idx;
1771     reg_errcode_t err;
1772     re_node_set *inv_eclosure = dfa->inveclosures + node;
1773     re_node_set except_nodes;
1774     re_node_set_init_empty (&except_nodes);
1775     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1776       {
1777         int cur_node = inv_eclosure->elems[ecl_idx];
1778         if (cur_node == node)
1779           continue;
1780         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1781           {
1782             int edst1 = dfa->edests[cur_node].elems[0];
1783             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1784                          ? dfa->edests[cur_node].elems[1] : -1);
1785             if ((!re_node_set_contains (inv_eclosure, edst1)
1786                  && re_node_set_contains (dest_nodes, edst1))
1787                 || (edst2 > 0
1788                     && !re_node_set_contains (inv_eclosure, edst2)
1789                     && re_node_set_contains (dest_nodes, edst2)))
1790               {
1791                 err = re_node_set_add_intersect (&except_nodes, candidates,
1792                                                  dfa->inveclosures + cur_node);
1793                 if (BE (err != REG_NOERROR, 0))
1794                   {
1795                     re_node_set_free (&except_nodes);
1796                     return err;
1797                   }
1798               }
1799           }
1800       }
1801     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1802       {
1803         int cur_node = inv_eclosure->elems[ecl_idx];
1804         if (!re_node_set_contains (&except_nodes, cur_node))
1805           {
1806             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1807             re_node_set_remove_at (dest_nodes, idx);
1808           }
1809       }
1810     re_node_set_free (&except_nodes);
1811     return REG_NOERROR;
1812 }
1813
1814 static int
1815 internal_function
1816 check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
1817                   int dst_node, int dst_idx, int src_node, int src_idx)
1818 {
1819   re_dfa_t *const dfa = mctx->dfa;
1820   int lim_idx, src_pos, dst_pos;
1821
1822   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1823   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1824   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1825     {
1826       int subexp_idx;
1827       struct re_backref_cache_entry *ent;
1828       ent = mctx->bkref_ents + limits->elems[lim_idx];
1829       subexp_idx = dfa->nodes[ent->node].opr.idx;
1830
1831       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1832                                            subexp_idx, dst_node, dst_idx,
1833                                            dst_bkref_idx);
1834       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1835                                            subexp_idx, src_node, src_idx,
1836                                            src_bkref_idx);
1837
1838       /* In case of:
1839          <src> <dst> ( <subexp> )
1840          ( <subexp> ) <src> <dst>
1841          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1842       if (src_pos == dst_pos)
1843         continue; /* This is unrelated limitation.  */
1844       else
1845         return 1;
1846     }
1847   return 0;
1848 }
1849
1850 static int
1851 internal_function
1852 check_dst_limits_calc_pos_1 (re_match_context_t *mctx, int boundaries,
1853                              int subexp_idx, int from_node, int bkref_idx)
1854 {
1855   re_dfa_t *const dfa = mctx->dfa;
1856   re_node_set *eclosures = dfa->eclosures + from_node;
1857   int node_idx;
1858
1859   /* Else, we are on the boundary: examine the nodes on the epsilon
1860      closure.  */
1861   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1862     {
1863       int node = eclosures->elems[node_idx];
1864       switch (dfa->nodes[node].type)
1865         {
1866         case OP_BACK_REF:
1867           if (bkref_idx != -1)
1868             {
1869               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1870               do
1871                 {
1872                   int dst, cpos;
1873
1874                   if (ent->node != node)
1875                     continue;
1876
1877                   if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1878                       && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1879                     continue;
1880
1881                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1882                      OP_CLOSE_SUBEXP cases below.  But, if the
1883                      destination node is the same node as the source
1884                      node, don't recurse because it would cause an
1885                      infinite loop: a regex that exhibits this behavior
1886                      is ()\1*\1*  */
1887                   dst = dfa->edests[node].elems[0];
1888                   if (dst == from_node)
1889                     {
1890                       if (boundaries & 1)
1891                         return -1;
1892                       else /* if (boundaries & 2) */
1893                         return 0;
1894                     }
1895
1896                   cpos =
1897                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1898                                                  dst, bkref_idx);
1899                   if (cpos == -1 /* && (boundaries & 1) */)
1900                     return -1;
1901                   if (cpos == 0 && (boundaries & 2))
1902                     return 0;
1903
1904                   ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1905                 }
1906               while (ent++->more);
1907             }
1908           break;
1909
1910         case OP_OPEN_SUBEXP:
1911           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1912             return -1;
1913           break;
1914
1915         case OP_CLOSE_SUBEXP:
1916           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1917             return 0;
1918           break;
1919
1920         default:
1921             break;
1922         }
1923     }
1924
1925   return (boundaries & 2) ? 1 : 0;
1926 }
1927
1928 static int
1929 internal_function
1930 check_dst_limits_calc_pos (re_match_context_t *mctx, int limit, int subexp_idx,
1931                            int from_node, int str_idx, int bkref_idx)
1932 {
1933   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1934   int boundaries;
1935
1936   /* If we are outside the range of the subexpression, return -1 or 1.  */
1937   if (str_idx < lim->subexp_from)
1938     return -1;
1939
1940   if (lim->subexp_to < str_idx)
1941     return 1;
1942
1943   /* If we are within the subexpression, return 0.  */
1944   boundaries = (str_idx == lim->subexp_from);
1945   boundaries |= (str_idx == lim->subexp_to) << 1;
1946   if (boundaries == 0)
1947     return 0;
1948
1949   /* Else, examine epsilon closure.  */
1950   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1951                                       from_node, bkref_idx);
1952 }
1953
1954 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1955    which are against limitations from DEST_NODES. */
1956
1957 static reg_errcode_t
1958 internal_function
1959 check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
1960                      const re_node_set *candidates, re_node_set *limits,
1961                      struct re_backref_cache_entry *bkref_ents, int str_idx)
1962 {
1963   reg_errcode_t err;
1964   int node_idx, lim_idx;
1965
1966   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1967     {
1968       int subexp_idx;
1969       struct re_backref_cache_entry *ent;
1970       ent = bkref_ents + limits->elems[lim_idx];
1971
1972       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1973         continue; /* This is unrelated limitation.  */
1974
1975       subexp_idx = dfa->nodes[ent->node].opr.idx;
1976       if (ent->subexp_to == str_idx)
1977         {
1978           int ops_node = -1;
1979           int cls_node = -1;
1980           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1981             {
1982               int node = dest_nodes->elems[node_idx];
1983               re_token_type_t type = dfa->nodes[node].type;
1984               if (type == OP_OPEN_SUBEXP
1985                   && subexp_idx == dfa->nodes[node].opr.idx)
1986                 ops_node = node;
1987               else if (type == OP_CLOSE_SUBEXP
1988                        && subexp_idx == dfa->nodes[node].opr.idx)
1989                 cls_node = node;
1990             }
1991
1992           /* Check the limitation of the open subexpression.  */
1993           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
1994           if (ops_node >= 0)
1995             {
1996               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
1997                                            candidates);
1998               if (BE (err != REG_NOERROR, 0))
1999                 return err;
2000             }
2001
2002           /* Check the limitation of the close subexpression.  */
2003           if (cls_node >= 0)
2004             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2005               {
2006                 int node = dest_nodes->elems[node_idx];
2007                 if (!re_node_set_contains (dfa->inveclosures + node,
2008                                            cls_node)
2009                     && !re_node_set_contains (dfa->eclosures + node,
2010                                               cls_node))
2011                   {
2012                     /* It is against this limitation.
2013                        Remove it form the current sifted state.  */
2014                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2015                                                  candidates);
2016                     if (BE (err != REG_NOERROR, 0))
2017                       return err;
2018                     --node_idx;
2019                   }
2020               }
2021         }
2022       else /* (ent->subexp_to != str_idx)  */
2023         {
2024           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2025             {
2026               int node = dest_nodes->elems[node_idx];
2027               re_token_type_t type = dfa->nodes[node].type;
2028               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2029                 {
2030                   if (subexp_idx != dfa->nodes[node].opr.idx)
2031                     continue;
2032                   /* It is against this limitation.
2033                      Remove it form the current sifted state.  */
2034                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2035                                                candidates);
2036                   if (BE (err != REG_NOERROR, 0))
2037                     return err;
2038                 }
2039             }
2040         }
2041     }
2042   return REG_NOERROR;
2043 }
2044
2045 static reg_errcode_t
2046 internal_function
2047 sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2048                    int str_idx, const re_node_set *candidates)
2049 {
2050   re_dfa_t *const dfa = mctx->dfa;
2051   reg_errcode_t err;
2052   int node_idx, node;
2053   re_sift_context_t local_sctx;
2054   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2055
2056   if (first_idx == -1)
2057     return REG_NOERROR;
2058
2059   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2060
2061   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2062     {
2063       int enabled_idx;
2064       re_token_type_t type;
2065       struct re_backref_cache_entry *entry;
2066       node = candidates->elems[node_idx];
2067       type = dfa->nodes[node].type;
2068       /* Avoid infinite loop for the REs like "()\1+".  */
2069       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2070         continue;
2071       if (type != OP_BACK_REF)
2072         continue;
2073
2074       entry = mctx->bkref_ents + first_idx;
2075       enabled_idx = first_idx;
2076       do
2077         {
2078           int subexp_len, to_idx, dst_node, ret;
2079           re_dfastate_t *cur_state;
2080
2081           if (entry->node != node)
2082             continue;
2083           subexp_len = entry->subexp_to - entry->subexp_from;
2084           to_idx = str_idx + subexp_len;
2085           dst_node = (subexp_len ? dfa->nexts[node]
2086                       : dfa->edests[node].elems[0]);
2087
2088           if (to_idx > sctx->last_str_idx
2089               || sctx->sifted_states[to_idx] == NULL
2090               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2091               || check_dst_limits (mctx, &sctx->limits, node,
2092                                    str_idx, dst_node, to_idx))
2093             continue;
2094
2095           if (local_sctx.sifted_states == NULL)
2096             {
2097               local_sctx = *sctx;
2098               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2099               if (BE (err != REG_NOERROR, 0))
2100                 goto free_return;
2101             }
2102           local_sctx.last_node = node;
2103           local_sctx.last_str_idx = str_idx;
2104           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2105           if (BE (ret < 0, 0))
2106             {
2107               err = REG_ESPACE;
2108               goto free_return;
2109             }
2110           cur_state = local_sctx.sifted_states[str_idx];
2111           err = sift_states_backward (mctx, &local_sctx);
2112           if (BE (err != REG_NOERROR, 0))
2113             goto free_return;
2114           if (sctx->limited_states != NULL)
2115             {
2116               err = merge_state_array (dfa, sctx->limited_states,
2117                                        local_sctx.sifted_states,
2118                                        str_idx + 1);
2119               if (BE (err != REG_NOERROR, 0))
2120                 goto free_return;
2121             }
2122           local_sctx.sifted_states[str_idx] = cur_state;
2123           re_node_set_remove (&local_sctx.limits, enabled_idx);
2124
2125           /* mctx->bkref_ents may have changed, reload the pointer.  */
2126           entry = mctx->bkref_ents + enabled_idx;
2127         }
2128       while (enabled_idx++, entry++->more);
2129     }
2130   err = REG_NOERROR;
2131  free_return:
2132   if (local_sctx.sifted_states != NULL)
2133     {
2134       re_node_set_free (&local_sctx.limits);
2135     }
2136
2137   return err;
2138 }
2139
2140
2141 #ifdef RE_ENABLE_I18N
2142 static int
2143 internal_function
2144 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2145                      int node_idx, int str_idx, int max_str_idx)
2146 {
2147   re_dfa_t *const dfa = mctx->dfa;
2148   int naccepted;
2149   /* Check the node can accept `multi byte'.  */
2150   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2151   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2152       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2153                             dfa->nexts[node_idx]))
2154     /* The node can't accept the `multi byte', or the
2155        destination was already thrown away, then the node
2156        could't accept the current input `multi byte'.   */
2157     naccepted = 0;
2158   /* Otherwise, it is sure that the node could accept
2159      `naccepted' bytes input.  */
2160   return naccepted;
2161 }
2162 #endif /* RE_ENABLE_I18N */
2163
2164 \f
2165 /* Functions for state transition.  */
2166
2167 /* Return the next state to which the current state STATE will transit by
2168    accepting the current input byte, and update STATE_LOG if necessary.
2169    If STATE can accept a multibyte char/collating element/back reference
2170    update the destination of STATE_LOG.  */
2171
2172 static re_dfastate_t *
2173 internal_function
2174 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2175                re_dfastate_t *state)
2176 {
2177   re_dfastate_t **trtable;
2178   unsigned char ch;
2179
2180 #ifdef RE_ENABLE_I18N
2181   /* If the current state can accept multibyte.  */
2182   if (BE (state->accept_mb, 0))
2183     {
2184       *err = transit_state_mb (mctx, state);
2185       if (BE (*err != REG_NOERROR, 0))
2186         return NULL;
2187     }
2188 #endif /* RE_ENABLE_I18N */
2189
2190   /* Then decide the next state with the single byte.  */
2191 #if 0
2192   if (0)
2193     /* don't use transition table  */
2194     return transit_state_sb (err, mctx, state);
2195 #endif
2196
2197   /* Use transition table  */
2198   ch = re_string_fetch_byte (&mctx->input);
2199   for (;;)
2200     {
2201       trtable = state->trtable;
2202       if (BE (trtable != NULL, 1))
2203         return trtable[ch];
2204
2205       trtable = state->word_trtable;
2206       if (BE (trtable != NULL, 1))
2207         {
2208           unsigned int context;
2209           context
2210             = re_string_context_at (&mctx->input,
2211                                     re_string_cur_idx (&mctx->input) - 1,
2212                                     mctx->eflags);
2213           if (IS_WORD_CONTEXT (context))
2214             return trtable[ch + SBC_MAX];
2215           else
2216             return trtable[ch];
2217         }
2218
2219       if (!build_trtable (mctx->dfa, state))
2220         {
2221           *err = REG_ESPACE;
2222           return NULL;
2223         }
2224
2225       /* Retry, we now have a transition table.  */
2226     }
2227 }
2228
2229 /* Update the state_log if we need */
2230 re_dfastate_t *
2231 internal_function
2232 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2233                       re_dfastate_t *next_state)
2234 {
2235   re_dfa_t *const dfa = mctx->dfa;
2236   int cur_idx = re_string_cur_idx (&mctx->input);
2237
2238   if (cur_idx > mctx->state_log_top)
2239     {
2240       mctx->state_log[cur_idx] = next_state;
2241       mctx->state_log_top = cur_idx;
2242     }
2243   else if (mctx->state_log[cur_idx] == 0)
2244     {
2245       mctx->state_log[cur_idx] = next_state;
2246     }
2247   else
2248     {
2249       re_dfastate_t *pstate;
2250       unsigned int context;
2251       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2252       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2253          the destination of a multibyte char/collating element/
2254          back reference.  Then the next state is the union set of
2255          these destinations and the results of the transition table.  */
2256       pstate = mctx->state_log[cur_idx];
2257       log_nodes = pstate->entrance_nodes;
2258       if (next_state != NULL)
2259         {
2260           table_nodes = next_state->entrance_nodes;
2261           *err = re_node_set_init_union (&next_nodes, table_nodes,
2262                                              log_nodes);
2263           if (BE (*err != REG_NOERROR, 0))
2264             return NULL;
2265         }
2266       else
2267         next_nodes = *log_nodes;
2268       /* Note: We already add the nodes of the initial state,
2269          then we don't need to add them here.  */
2270
2271       context = re_string_context_at (&mctx->input,
2272                                       re_string_cur_idx (&mctx->input) - 1,
2273                                       mctx->eflags);
2274       next_state = mctx->state_log[cur_idx]
2275         = re_acquire_state_context (err, dfa, &next_nodes, context);
2276       /* We don't need to check errors here, since the return value of
2277          this function is next_state and ERR is already set.  */
2278
2279       if (table_nodes != NULL)
2280         re_node_set_free (&next_nodes);
2281     }
2282
2283   if (BE (dfa->nbackref, 0) && next_state != NULL)
2284     {
2285       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2286          later.  We must check them here, since the back references in the
2287          next state might use them.  */
2288       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2289                                         cur_idx);
2290       if (BE (*err != REG_NOERROR, 0))
2291         return NULL;
2292
2293       /* If the next state has back references.  */
2294       if (next_state->has_backref)
2295         {
2296           *err = transit_state_bkref (mctx, &next_state->nodes);
2297           if (BE (*err != REG_NOERROR, 0))
2298             return NULL;
2299           next_state = mctx->state_log[cur_idx];
2300         }
2301     }
2302
2303   return next_state;
2304 }
2305
2306 /* Skip bytes in the input that correspond to part of a
2307    multi-byte match, then look in the log for a state
2308    from which to restart matching.  */
2309 static re_dfastate_t *
2310 internal_function
2311 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2312 {
2313   re_dfastate_t *cur_state = NULL;
2314   do
2315     {
2316       int max = mctx->state_log_top;
2317       int cur_str_idx = re_string_cur_idx (&mctx->input);
2318
2319       do
2320         {
2321           if (++cur_str_idx > max)
2322             return NULL;
2323           re_string_skip_bytes (&mctx->input, 1);
2324         }
2325       while (mctx->state_log[cur_str_idx] == NULL);
2326
2327       cur_state = merge_state_with_log (err, mctx, NULL);
2328     }
2329   while (err == REG_NOERROR && cur_state == NULL);
2330   return cur_state;
2331 }
2332
2333 /* Helper functions for transit_state.  */
2334
2335 /* From the node set CUR_NODES, pick up the nodes whose types are
2336    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2337    expression. And register them to use them later for evaluating the
2338    correspoding back references.  */
2339
2340 static reg_errcode_t
2341 internal_function
2342 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2343                            int str_idx)
2344 {
2345   re_dfa_t *const dfa = mctx->dfa;
2346   int node_idx;
2347   reg_errcode_t err;
2348
2349   /* TODO: This isn't efficient.
2350            Because there might be more than one nodes whose types are
2351            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2352            nodes.
2353            E.g. RE: (a){2}  */
2354   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2355     {
2356       int node = cur_nodes->elems[node_idx];
2357       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2358           && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2359           && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2360         {
2361           err = match_ctx_add_subtop (mctx, node, str_idx);
2362           if (BE (err != REG_NOERROR, 0))
2363             return err;
2364         }
2365     }
2366   return REG_NOERROR;
2367 }
2368
2369 #if 0
2370 /* Return the next state to which the current state STATE will transit by
2371    accepting the current input byte.  */
2372
2373 static re_dfastate_t *
2374 transit_state_sb (err, mctx, state)
2375      reg_errcode_t *err;
2376      re_match_context_t *mctx;
2377      re_dfastate_t *state;
2378 {
2379   re_dfa_t *const dfa = mctx->dfa;
2380   re_node_set next_nodes;
2381   re_dfastate_t *next_state;
2382   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2383   unsigned int context;
2384
2385   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2386   if (BE (*err != REG_NOERROR, 0))
2387     return NULL;
2388   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2389     {
2390       int cur_node = state->nodes.elems[node_cnt];
2391       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2392         {
2393           *err = re_node_set_merge (&next_nodes,
2394                                     dfa->eclosures + dfa->nexts[cur_node]);
2395           if (BE (*err != REG_NOERROR, 0))
2396             {
2397               re_node_set_free (&next_nodes);
2398               return NULL;
2399             }
2400         }
2401     }
2402   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2403   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2404   /* We don't need to check errors here, since the return value of
2405      this function is next_state and ERR is already set.  */
2406
2407   re_node_set_free (&next_nodes);
2408   re_string_skip_bytes (&mctx->input, 1);
2409   return next_state;
2410 }
2411 #endif
2412
2413 #ifdef RE_ENABLE_I18N
2414 static reg_errcode_t
2415 internal_function
2416 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2417 {
2418   re_dfa_t *const dfa = mctx->dfa;
2419   reg_errcode_t err;
2420   int i;
2421
2422   for (i = 0; i < pstate->nodes.nelem; ++i)
2423     {
2424       re_node_set dest_nodes, *new_nodes;
2425       int cur_node_idx = pstate->nodes.elems[i];
2426       int naccepted, dest_idx;
2427       unsigned int context;
2428       re_dfastate_t *dest_state;
2429
2430       if (!dfa->nodes[cur_node_idx].accept_mb)
2431         continue;
2432
2433       if (dfa->nodes[cur_node_idx].constraint)
2434         {
2435           context = re_string_context_at (&mctx->input,
2436                                           re_string_cur_idx (&mctx->input),
2437                                           mctx->eflags);
2438           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2439                                            context))
2440             continue;
2441         }
2442
2443       /* How many bytes the node can accept?  */
2444       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2445                                            re_string_cur_idx (&mctx->input));
2446       if (naccepted == 0)
2447         continue;
2448
2449       /* The node can accepts `naccepted' bytes.  */
2450       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2451       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2452                                : mctx->max_mb_elem_len);
2453       err = clean_state_log_if_needed (mctx, dest_idx);
2454       if (BE (err != REG_NOERROR, 0))
2455         return err;
2456 #ifdef DEBUG
2457       assert (dfa->nexts[cur_node_idx] != -1);
2458 #endif
2459       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2460
2461       dest_state = mctx->state_log[dest_idx];
2462       if (dest_state == NULL)
2463         dest_nodes = *new_nodes;
2464       else
2465         {
2466           err = re_node_set_init_union (&dest_nodes,
2467                                         dest_state->entrance_nodes, new_nodes);
2468           if (BE (err != REG_NOERROR, 0))
2469             return err;
2470         }
2471       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2472       mctx->state_log[dest_idx]
2473         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2474       if (dest_state != NULL)
2475         re_node_set_free (&dest_nodes);
2476       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2477         return err;
2478     }
2479   return REG_NOERROR;
2480 }
2481 #endif /* RE_ENABLE_I18N */
2482
2483 static reg_errcode_t
2484 internal_function
2485 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2486 {
2487   re_dfa_t *const dfa = mctx->dfa;
2488   reg_errcode_t err;
2489   int i;
2490   int cur_str_idx = re_string_cur_idx (&mctx->input);
2491
2492   for (i = 0; i < nodes->nelem; ++i)
2493     {
2494       int dest_str_idx, prev_nelem, bkc_idx;
2495       int node_idx = nodes->elems[i];
2496       unsigned int context;
2497       const re_token_t *node = dfa->nodes + node_idx;
2498       re_node_set *new_dest_nodes;
2499
2500       /* Check whether `node' is a backreference or not.  */
2501       if (node->type != OP_BACK_REF)
2502         continue;
2503
2504       if (node->constraint)
2505         {
2506           context = re_string_context_at (&mctx->input, cur_str_idx,
2507                                           mctx->eflags);
2508           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2509             continue;
2510         }
2511
2512       /* `node' is a backreference.
2513          Check the substring which the substring matched.  */
2514       bkc_idx = mctx->nbkref_ents;
2515       err = get_subexp (mctx, node_idx, cur_str_idx);
2516       if (BE (err != REG_NOERROR, 0))
2517         goto free_return;
2518
2519       /* And add the epsilon closures (which is `new_dest_nodes') of
2520          the backreference to appropriate state_log.  */
2521 #ifdef DEBUG
2522       assert (dfa->nexts[node_idx] != -1);
2523 #endif
2524       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2525         {
2526           int subexp_len;
2527           re_dfastate_t *dest_state;
2528           struct re_backref_cache_entry *bkref_ent;
2529           bkref_ent = mctx->bkref_ents + bkc_idx;
2530           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2531             continue;
2532           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2533           new_dest_nodes = (subexp_len == 0
2534                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2535                             : dfa->eclosures + dfa->nexts[node_idx]);
2536           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2537                           - bkref_ent->subexp_from);
2538           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2539                                           mctx->eflags);
2540           dest_state = mctx->state_log[dest_str_idx];
2541           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2542                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2543           /* Add `new_dest_node' to state_log.  */
2544           if (dest_state == NULL)
2545             {
2546               mctx->state_log[dest_str_idx]
2547                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2548                                             context);
2549               if (BE (mctx->state_log[dest_str_idx] == NULL
2550                       && err != REG_NOERROR, 0))
2551                 goto free_return;
2552             }
2553           else
2554             {
2555               re_node_set dest_nodes;
2556               err = re_node_set_init_union (&dest_nodes,
2557                                             dest_state->entrance_nodes,
2558                                             new_dest_nodes);
2559               if (BE (err != REG_NOERROR, 0))
2560                 {
2561                   re_node_set_free (&dest_nodes);
2562                   goto free_return;
2563                 }
2564               mctx->state_log[dest_str_idx]
2565                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2566               re_node_set_free (&dest_nodes);
2567               if (BE (mctx->state_log[dest_str_idx] == NULL
2568                       && err != REG_NOERROR, 0))
2569                 goto free_return;
2570             }
2571           /* We need to check recursively if the backreference can epsilon
2572              transit.  */
2573           if (subexp_len == 0
2574               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2575             {
2576               err = check_subexp_matching_top (mctx, new_dest_nodes,
2577                                                cur_str_idx);
2578               if (BE (err != REG_NOERROR, 0))
2579                 goto free_return;
2580               err = transit_state_bkref (mctx, new_dest_nodes);
2581               if (BE (err != REG_NOERROR, 0))
2582                 goto free_return;
2583             }
2584         }
2585     }
2586   err = REG_NOERROR;
2587  free_return:
2588   return err;
2589 }
2590
2591 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2592    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2593    Note that we might collect inappropriate candidates here.
2594    However, the cost of checking them strictly here is too high, then we
2595    delay these checking for prune_impossible_nodes().  */
2596
2597 static reg_errcode_t
2598 internal_function
2599 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2600 {
2601   re_dfa_t *const dfa = mctx->dfa;
2602   int subexp_num, sub_top_idx;
2603   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2604   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2605   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2606   if (cache_idx != -1)
2607     {
2608       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2609       do
2610         if (entry->node == bkref_node)
2611           return REG_NOERROR; /* We already checked it.  */
2612       while (entry++->more);
2613     }
2614
2615   subexp_num = dfa->nodes[bkref_node].opr.idx;
2616
2617   /* For each sub expression  */
2618   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2619     {
2620       reg_errcode_t err;
2621       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2622       re_sub_match_last_t *sub_last;
2623       int sub_last_idx, sl_str, bkref_str_off;
2624
2625       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2626         continue; /* It isn't related.  */
2627
2628       sl_str = sub_top->str_idx;
2629       bkref_str_off = bkref_str_idx;
2630       /* At first, check the last node of sub expressions we already
2631          evaluated.  */
2632       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2633         {
2634           int sl_str_diff;
2635           sub_last = sub_top->lasts[sub_last_idx];
2636           sl_str_diff = sub_last->str_idx - sl_str;
2637           /* The matched string by the sub expression match with the substring
2638              at the back reference?  */
2639           if (sl_str_diff > 0)
2640             {
2641               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2642                 {
2643                   /* Not enough chars for a successful match.  */
2644                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2645                     break;
2646
2647                   err = clean_state_log_if_needed (mctx,
2648                                                    bkref_str_off
2649                                                    + sl_str_diff);
2650                   if (BE (err != REG_NOERROR, 0))
2651                     return err;
2652                   buf = (const char *) re_string_get_buffer (&mctx->input);
2653                 }
2654               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2655                 break; /* We don't need to search this sub expression any more.  */
2656             }
2657           bkref_str_off += sl_str_diff;
2658           sl_str += sl_str_diff;
2659           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2660                                 bkref_str_idx);
2661
2662           /* Reload buf, since the preceding call might have reallocated
2663              the buffer.  */
2664           buf = (const char *) re_string_get_buffer (&mctx->input);
2665
2666           if (err == REG_NOMATCH)
2667             continue;
2668           if (BE (err != REG_NOERROR, 0))
2669             return err;
2670         }
2671
2672       if (sub_last_idx < sub_top->nlasts)
2673         continue;
2674       if (sub_last_idx > 0)
2675         ++sl_str;
2676       /* Then, search for the other last nodes of the sub expression.  */
2677       for (; sl_str <= bkref_str_idx; ++sl_str)
2678         {
2679           int cls_node, sl_str_off;
2680           const re_node_set *nodes;
2681           sl_str_off = sl_str - sub_top->str_idx;
2682           /* The matched string by the sub expression match with the substring
2683              at the back reference?  */
2684           if (sl_str_off > 0)
2685             {
2686               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2687                 {
2688                   /* If we are at the end of the input, we cannot match.  */
2689                   if (bkref_str_off >= mctx->input.len)
2690                     break;
2691
2692                   err = extend_buffers (mctx);
2693                   if (BE (err != REG_NOERROR, 0))
2694                     return err;
2695
2696                   buf = (const char *) re_string_get_buffer (&mctx->input);
2697                 }
2698               if (buf [bkref_str_off++] != buf[sl_str - 1])
2699                 break; /* We don't need to search this sub expression
2700                           any more.  */
2701             }
2702           if (mctx->state_log[sl_str] == NULL)
2703             continue;
2704           /* Does this state have a ')' of the sub expression?  */
2705           nodes = &mctx->state_log[sl_str]->nodes;
2706           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2707           if (cls_node == -1)
2708             continue; /* No.  */
2709           if (sub_top->path == NULL)
2710             {
2711               sub_top->path = calloc (sizeof (state_array_t),
2712                                       sl_str - sub_top->str_idx + 1);
2713               if (sub_top->path == NULL)
2714                 return REG_ESPACE;
2715             }
2716           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2717              in the current context?  */
2718           err = check_arrival (mctx, sub_top->path, sub_top->node,
2719                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2720           if (err == REG_NOMATCH)
2721               continue;
2722           if (BE (err != REG_NOERROR, 0))
2723               return err;
2724           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2725           if (BE (sub_last == NULL, 0))
2726             return REG_ESPACE;
2727           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2728                                 bkref_str_idx);
2729           if (err == REG_NOMATCH)
2730             continue;
2731         }
2732     }
2733   return REG_NOERROR;
2734 }
2735
2736 /* Helper functions for get_subexp().  */
2737
2738 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2739    If it can arrive, register the sub expression expressed with SUB_TOP
2740    and SUB_LAST.  */
2741
2742 static reg_errcode_t
2743 internal_function
2744 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2745                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2746 {
2747   reg_errcode_t err;
2748   int to_idx;
2749   /* Can the subexpression arrive the back reference?  */
2750   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2751                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2752   if (err != REG_NOERROR)
2753     return err;
2754   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2755                              sub_last->str_idx);
2756   if (BE (err != REG_NOERROR, 0))
2757     return err;
2758   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2759   return clean_state_log_if_needed (mctx, to_idx);
2760 }
2761
2762 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2763    Search '(' if FL_OPEN, or search ')' otherwise.
2764    TODO: This function isn't efficient...
2765          Because there might be more than one nodes whose types are
2766          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2767          nodes.
2768          E.g. RE: (a){2}  */
2769
2770 static int
2771 internal_function
2772 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2773                   int subexp_idx, int type)
2774 {
2775   int cls_idx;
2776   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2777     {
2778       int cls_node = nodes->elems[cls_idx];
2779       const re_token_t *node = dfa->nodes + cls_node;
2780       if (node->type == type
2781           && node->opr.idx == subexp_idx)
2782         return cls_node;
2783     }
2784   return -1;
2785 }
2786
2787 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2788    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2789    heavily reused.
2790    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2791
2792 static reg_errcode_t
2793 internal_function
2794 check_arrival (re_match_context_t *mctx, state_array_t *path,
2795                int top_node, int top_str, int last_node, int last_str,
2796                int type)
2797 {
2798   re_dfa_t *const dfa = mctx->dfa;
2799   reg_errcode_t err;
2800   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2801   re_dfastate_t *cur_state = NULL;
2802   re_node_set *cur_nodes, next_nodes;
2803   re_dfastate_t **backup_state_log;
2804   unsigned int context;
2805
2806   subexp_num = dfa->nodes[top_node].opr.idx;
2807   /* Extend the buffer if we need.  */
2808   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2809     {
2810       re_dfastate_t **new_array;
2811       int old_alloc = path->alloc;
2812       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2813       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2814       if (new_array == NULL)
2815         {
2816           path->alloc = old_alloc;
2817           return REG_ESPACE;
2818         }
2819       path->array = new_array;
2820       memset (new_array + old_alloc, '\0',
2821               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2822     }
2823
2824   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2825
2826   /* Temporary modify MCTX.  */
2827   backup_state_log = mctx->state_log;
2828   backup_cur_idx = mctx->input.cur_idx;
2829   mctx->state_log = path->array;
2830   mctx->input.cur_idx = str_idx;
2831
2832   /* Setup initial node set.  */
2833   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2834   if (str_idx == top_str)
2835     {
2836       err = re_node_set_init_1 (&next_nodes, top_node);
2837       if (BE (err != REG_NOERROR, 0))
2838         return err;
2839       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2840       if (BE (err != REG_NOERROR, 0))
2841         {
2842           re_node_set_free (&next_nodes);
2843           return err;
2844         }
2845     }
2846   else
2847     {
2848       cur_state = mctx->state_log[str_idx];
2849       if (cur_state && cur_state->has_backref)
2850         {
2851           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2852           if (BE ( err != REG_NOERROR, 0))
2853             return err;
2854         }
2855       else
2856         re_node_set_init_empty (&next_nodes);
2857     }
2858   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2859     {
2860       if (next_nodes.nelem)
2861         {
2862           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2863                                     subexp_num, type);
2864           if (BE ( err != REG_NOERROR, 0))
2865             {
2866               re_node_set_free (&next_nodes);
2867               return err;
2868             }
2869         }
2870       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2871       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2872         {
2873           re_node_set_free (&next_nodes);
2874           return err;
2875         }
2876       mctx->state_log[str_idx] = cur_state;
2877     }
2878
2879   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2880     {
2881       re_node_set_empty (&next_nodes);
2882       if (mctx->state_log[str_idx + 1])
2883         {
2884           err = re_node_set_merge (&next_nodes,
2885                                    &mctx->state_log[str_idx + 1]->nodes);
2886           if (BE (err != REG_NOERROR, 0))
2887             {
2888               re_node_set_free (&next_nodes);
2889               return err;
2890             }
2891         }
2892       if (cur_state)
2893         {
2894           err = check_arrival_add_next_nodes (mctx, str_idx,
2895                                               &cur_state->non_eps_nodes, &next_nodes);
2896           if (BE (err != REG_NOERROR, 0))
2897             {
2898               re_node_set_free (&next_nodes);
2899               return err;
2900             }
2901         }
2902       ++str_idx;
2903       if (next_nodes.nelem)
2904         {
2905           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2906           if (BE (err != REG_NOERROR, 0))
2907             {
2908               re_node_set_free (&next_nodes);
2909               return err;
2910             }
2911           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2912                                     subexp_num, type);
2913           if (BE ( err != REG_NOERROR, 0))
2914             {
2915               re_node_set_free (&next_nodes);
2916               return err;
2917             }
2918         }
2919       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2920       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2921       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2922         {
2923           re_node_set_free (&next_nodes);
2924           return err;
2925         }
2926       mctx->state_log[str_idx] = cur_state;
2927       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2928     }
2929   re_node_set_free (&next_nodes);
2930   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2931                : &mctx->state_log[last_str]->nodes);
2932   path->next_idx = str_idx;
2933
2934   /* Fix MCTX.  */
2935   mctx->state_log = backup_state_log;
2936   mctx->input.cur_idx = backup_cur_idx;
2937
2938   /* Then check the current node set has the node LAST_NODE.  */
2939   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2940     return REG_NOERROR;
2941
2942   return REG_NOMATCH;
2943 }
2944
2945 /* Helper functions for check_arrival.  */
2946
2947 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2948    to NEXT_NODES.
2949    TODO: This function is similar to the functions transit_state*(),
2950          however this function has many additional works.
2951          Can't we unify them?  */
2952
2953 static reg_errcode_t
2954 internal_function
2955 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
2956                               re_node_set *cur_nodes,
2957                               re_node_set *next_nodes)
2958 {
2959   re_dfa_t *const dfa = mctx->dfa;
2960   int result;
2961   int cur_idx;
2962   reg_errcode_t err;
2963   re_node_set union_set;
2964   re_node_set_init_empty (&union_set);
2965   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2966     {
2967       int naccepted = 0;
2968       int cur_node = cur_nodes->elems[cur_idx];
2969 #ifdef DEBUG
2970       re_token_type_t type = dfa->nodes[cur_node].type;
2971       assert (!IS_EPSILON_NODE (type));
2972 #endif
2973 #ifdef RE_ENABLE_I18N
2974       /* If the node may accept `multi byte'.  */
2975       if (dfa->nodes[cur_node].accept_mb)
2976         {
2977           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2978                                                str_idx);
2979           if (naccepted > 1)
2980             {
2981               re_dfastate_t *dest_state;
2982               int next_node = dfa->nexts[cur_node];
2983               int next_idx = str_idx + naccepted;
2984               dest_state = mctx->state_log[next_idx];
2985               re_node_set_empty (&union_set);
2986               if (dest_state)
2987                 {
2988                   err = re_node_set_merge (&union_set, &dest_state->nodes);
2989                   if (BE (err != REG_NOERROR, 0))
2990                     {
2991                       re_node_set_free (&union_set);
2992                       return err;
2993                     }
2994                 }
2995               result = re_node_set_insert (&union_set, next_node);
2996               if (BE (result < 0, 0))
2997                 {
2998                   re_node_set_free (&union_set);
2999                   return REG_ESPACE;
3000                 }
3001               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3002                                                             &union_set);
3003               if (BE (mctx->state_log[next_idx] == NULL
3004                       && err != REG_NOERROR, 0))
3005                 {
3006                   re_node_set_free (&union_set);
3007                   return err;
3008                 }
3009             }
3010         }
3011 #endif /* RE_ENABLE_I18N */
3012       if (naccepted
3013           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3014         {
3015           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3016           if (BE (result < 0, 0))
3017             {
3018               re_node_set_free (&union_set);
3019               return REG_ESPACE;
3020             }
3021         }
3022     }
3023   re_node_set_free (&union_set);
3024   return REG_NOERROR;
3025 }
3026
3027 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3028    CUR_NODES, however exclude the nodes which are:
3029     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3030     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3031 */
3032
3033 static reg_errcode_t
3034 internal_function
3035 check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3036                           int ex_subexp, int type)
3037 {
3038   reg_errcode_t err;
3039   int idx, outside_node;
3040   re_node_set new_nodes;
3041 #ifdef DEBUG
3042   assert (cur_nodes->nelem);
3043 #endif
3044   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3045   if (BE (err != REG_NOERROR, 0))
3046     return err;
3047   /* Create a new node set NEW_NODES with the nodes which are epsilon
3048      closures of the node in CUR_NODES.  */
3049
3050   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3051     {
3052       int cur_node = cur_nodes->elems[idx];
3053       re_node_set *eclosure = dfa->eclosures + cur_node;
3054       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3055       if (outside_node == -1)
3056         {
3057           /* There are no problematic nodes, just merge them.  */
3058           err = re_node_set_merge (&new_nodes, eclosure);
3059           if (BE (err != REG_NOERROR, 0))
3060             {
3061               re_node_set_free (&new_nodes);
3062               return err;
3063             }
3064         }
3065       else
3066         {
3067           /* There are problematic nodes, re-calculate incrementally.  */
3068           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3069                                               ex_subexp, type);
3070           if (BE (err != REG_NOERROR, 0))
3071             {
3072               re_node_set_free (&new_nodes);
3073               return err;
3074             }
3075         }
3076     }
3077   re_node_set_free (cur_nodes);
3078   *cur_nodes = new_nodes;
3079   return REG_NOERROR;
3080 }
3081
3082 /* Helper function for check_arrival_expand_ecl.
3083    Check incrementally the epsilon closure of TARGET, and if it isn't
3084    problematic append it to DST_NODES.  */
3085
3086 static reg_errcode_t
3087 internal_function
3088 check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3089                               int target, int ex_subexp, int type)
3090 {
3091   int cur_node;
3092   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3093     {
3094       int err;
3095
3096       if (dfa->nodes[cur_node].type == type
3097           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3098         {
3099           if (type == OP_CLOSE_SUBEXP)
3100             {
3101               err = re_node_set_insert (dst_nodes, cur_node);
3102               if (BE (err == -1, 0))
3103                 return REG_ESPACE;
3104             }
3105           break;
3106         }
3107       err = re_node_set_insert (dst_nodes, cur_node);
3108       if (BE (err == -1, 0))
3109         return REG_ESPACE;
3110       if (dfa->edests[cur_node].nelem == 0)
3111         break;
3112       if (dfa->edests[cur_node].nelem == 2)
3113         {
3114           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3115                                               dfa->edests[cur_node].elems[1],
3116                                               ex_subexp, type);
3117           if (BE (err != REG_NOERROR, 0))
3118             return err;
3119         }
3120       cur_node = dfa->edests[cur_node].elems[0];
3121     }
3122   return REG_NOERROR;
3123 }
3124
3125
3126 /* For all the back references in the current state, calculate the
3127    destination of the back references by the appropriate entry
3128    in MCTX->BKREF_ENTS.  */
3129
3130 static reg_errcode_t
3131 internal_function
3132 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3133                     int cur_str, int subexp_num, int type)
3134 {
3135   re_dfa_t *const dfa = mctx->dfa;
3136   reg_errcode_t err;
3137   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3138   struct re_backref_cache_entry *ent;
3139
3140   if (cache_idx_start == -1)
3141     return REG_NOERROR;
3142
3143  restart:
3144   ent = mctx->bkref_ents + cache_idx_start;
3145   do
3146     {
3147       int to_idx, next_node;
3148
3149       /* Is this entry ENT is appropriate?  */
3150       if (!re_node_set_contains (cur_nodes, ent->node))
3151         continue; /* No.  */
3152
3153       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3154       /* Calculate the destination of the back reference, and append it
3155          to MCTX->STATE_LOG.  */
3156       if (to_idx == cur_str)
3157         {
3158           /* The backreference did epsilon transit, we must re-check all the
3159              node in the current state.  */
3160           re_node_set new_dests;
3161           reg_errcode_t err2, err3;
3162           next_node = dfa->edests[ent->node].elems[0];
3163           if (re_node_set_contains (cur_nodes, next_node))
3164             continue;
3165           err = re_node_set_init_1 (&new_dests, next_node);
3166           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3167           err3 = re_node_set_merge (cur_nodes, &new_dests);
3168           re_node_set_free (&new_dests);
3169           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3170                   || err3 != REG_NOERROR, 0))
3171             {
3172               err = (err != REG_NOERROR ? err
3173                      : (err2 != REG_NOERROR ? err2 : err3));
3174               return err;
3175             }
3176           /* TODO: It is still inefficient...  */
3177           goto restart;
3178         }
3179       else
3180         {
3181           re_node_set union_set;
3182           next_node = dfa->nexts[ent->node];
3183           if (mctx->state_log[to_idx])
3184             {
3185               int ret;
3186               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3187                                         next_node))
3188                 continue;
3189               err = re_node_set_init_copy (&union_set,
3190                                            &mctx->state_log[to_idx]->nodes);
3191               ret = re_node_set_insert (&union_set, next_node);
3192               if (BE (err != REG_NOERROR || ret < 0, 0))
3193                 {
3194                   re_node_set_free (&union_set);
3195                   err = err != REG_NOERROR ? err : REG_ESPACE;
3196                   return err;
3197                 }
3198             }
3199           else
3200             {
3201               err = re_node_set_init_1 (&union_set, next_node);
3202               if (BE (err != REG_NOERROR, 0))
3203                 return err;
3204             }
3205           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3206           re_node_set_free (&union_set);
3207           if (BE (mctx->state_log[to_idx] == NULL
3208                   && err != REG_NOERROR, 0))
3209             return err;
3210         }
3211     }
3212   while (ent++->more);
3213   return REG_NOERROR;
3214 }
3215
3216 /* Build transition table for the state.
3217    Return 1 if succeeded, otherwise return NULL.  */
3218
3219 static int
3220 internal_function
3221 build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3222 {
3223   reg_errcode_t err;
3224   int i, j, ch, need_word_trtable = 0;
3225   unsigned int elem, mask;
3226   int dests_node_malloced = 0, dest_states_malloced = 0;
3227   int ndests; /* Number of the destination states from `state'.  */
3228   re_dfastate_t **trtable;
3229   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3230   re_node_set follows, *dests_node;
3231   bitset *dests_ch;
3232   bitset acceptable;
3233
3234   /* We build DFA states which corresponds to the destination nodes
3235      from `state'.  `dests_node[i]' represents the nodes which i-th
3236      destination state contains, and `dests_ch[i]' represents the
3237      characters which i-th destination state accepts.  */
3238 #ifdef _LIBC
3239   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3240     dests_node = (re_node_set *)
3241       alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3242   else
3243 #endif
3244     {
3245       dests_node = (re_node_set *)
3246         malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3247       if (BE (dests_node == NULL, 0))
3248         return 0;
3249       dests_node_malloced = 1;
3250     }
3251   dests_ch = (bitset *) (dests_node + SBC_MAX);
3252
3253   /* Initialize transiton table.  */
3254   state->word_trtable = state->trtable = NULL;
3255
3256   /* At first, group all nodes belonging to `state' into several
3257      destinations.  */
3258   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3259   if (BE (ndests <= 0, 0))
3260     {
3261       if (dests_node_malloced)
3262         free (dests_node);
3263       /* Return 0 in case of an error, 1 otherwise.  */
3264       if (ndests == 0)
3265         {
3266           state->trtable = (re_dfastate_t **)
3267             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3268           return 1;
3269         }
3270       return 0;
3271     }
3272
3273   err = re_node_set_alloc (&follows, ndests + 1);
3274   if (BE (err != REG_NOERROR, 0))
3275     goto out_free;
3276
3277 #ifdef _LIBC
3278   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3279                          + ndests * 3 * sizeof (re_dfastate_t *)))
3280     dest_states = (re_dfastate_t **)
3281       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3282   else
3283 #endif
3284     {
3285       dest_states = (re_dfastate_t **)
3286         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3287       if (BE (dest_states == NULL, 0))
3288         {
3289 out_free:
3290           if (dest_states_malloced)
3291             free (dest_states);
3292           re_node_set_free (&follows);
3293           for (i = 0; i < ndests; ++i)
3294             re_node_set_free (dests_node + i);
3295           if (dests_node_malloced)
3296             free (dests_node);
3297           return 0;
3298         }
3299       dest_states_malloced = 1;
3300     }
3301   dest_states_word = dest_states + ndests;
3302   dest_states_nl = dest_states_word + ndests;
3303   bitset_empty (acceptable);
3304
3305   /* Then build the states for all destinations.  */
3306   for (i = 0; i < ndests; ++i)
3307     {
3308       int next_node;
3309       re_node_set_empty (&follows);
3310       /* Merge the follows of this destination states.  */
3311       for (j = 0; j < dests_node[i].nelem; ++j)
3312         {
3313           next_node = dfa->nexts[dests_node[i].elems[j]];
3314           if (next_node != -1)
3315             {
3316               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3317               if (BE (err != REG_NOERROR, 0))
3318                 goto out_free;
3319             }
3320         }
3321       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3322       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3323         goto out_free;
3324       /* If the new state has context constraint,
3325          build appropriate states for these contexts.  */
3326       if (dest_states[i]->has_constraint)
3327         {
3328           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3329                                                           CONTEXT_WORD);
3330           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3331             goto out_free;
3332
3333           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3334             need_word_trtable = 1;
3335
3336           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3337                                                         CONTEXT_NEWLINE);
3338           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3339             goto out_free;
3340         }
3341       else
3342         {
3343           dest_states_word[i] = dest_states[i];
3344           dest_states_nl[i] = dest_states[i];
3345         }
3346       bitset_merge (acceptable, dests_ch[i]);
3347     }
3348
3349   if (!BE (need_word_trtable, 0))
3350     {
3351       /* We don't care about whether the following character is a word
3352          character, or we are in a single-byte character set so we can
3353          discern by looking at the character code: allocate a
3354          256-entry transition table.  */
3355       trtable = state->trtable =
3356         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3357       if (BE (trtable == NULL, 0))
3358         goto out_free;
3359
3360       /* For all characters ch...:  */
3361       for (i = 0; i < BITSET_UINTS; ++i)
3362         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3363              elem;
3364              mask <<= 1, elem >>= 1, ++ch)
3365           if (BE (elem & 1, 0))
3366             {
3367               /* There must be exactly one destination which accepts
3368                  character ch.  See group_nodes_into_DFAstates.  */
3369               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3370                 ;
3371
3372               /* j-th destination accepts the word character ch.  */
3373               if (dfa->word_char[i] & mask)
3374                 trtable[ch] = dest_states_word[j];
3375               else
3376                 trtable[ch] = dest_states[j];
3377             }
3378     }
3379   else
3380     {
3381       /* We care about whether the following character is a word
3382          character, and we are in a multi-byte character set: discern
3383          by looking at the character code: build two 256-entry
3384          transition tables, one starting at trtable[0] and one
3385          starting at trtable[SBC_MAX].  */
3386       trtable = state->word_trtable =
3387         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3388       if (BE (trtable == NULL, 0))
3389         goto out_free;
3390
3391       /* For all characters ch...:  */
3392       for (i = 0; i < BITSET_UINTS; ++i)
3393         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3394              elem;
3395              mask <<= 1, elem >>= 1, ++ch)
3396           if (BE (elem & 1, 0))
3397             {
3398               /* There must be exactly one destination which accepts
3399                  character ch.  See group_nodes_into_DFAstates.  */
3400               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3401                 ;
3402
3403               /* j-th destination accepts the word character ch.  */
3404               trtable[ch] = dest_states[j];
3405               trtable[ch + SBC_MAX] = dest_states_word[j];
3406             }
3407     }
3408
3409   /* new line */
3410   if (bitset_contain (acceptable, NEWLINE_CHAR))
3411     {
3412       /* The current state accepts newline character.  */
3413       for (j = 0; j < ndests; ++j)
3414         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3415           {
3416             /* k-th destination accepts newline character.  */
3417             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3418             if (need_word_trtable)
3419               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3420             /* There must be only one destination which accepts
3421                newline.  See group_nodes_into_DFAstates.  */
3422             break;
3423           }
3424     }
3425
3426   if (dest_states_malloced)
3427     free (dest_states);
3428
3429   re_node_set_free (&follows);
3430   for (i = 0; i < ndests; ++i)
3431     re_node_set_free (dests_node + i);
3432
3433   if (dests_node_malloced)
3434     free (dests_node);
3435
3436   return 1;
3437 }
3438
3439 /* Group all nodes belonging to STATE into several destinations.
3440    Then for all destinations, set the nodes belonging to the destination
3441    to DESTS_NODE[i] and set the characters accepted by the destination
3442    to DEST_CH[i].  This function return the number of destinations.  */
3443
3444 static int
3445 internal_function
3446 group_nodes_into_DFAstates (re_dfa_t *dfa, const re_dfastate_t *state,
3447                             re_node_set *dests_node, bitset *dests_ch)
3448 {
3449   reg_errcode_t err;
3450   int result;
3451   int i, j, k;
3452   int ndests; /* Number of the destinations from `state'.  */
3453   bitset accepts; /* Characters a node can accept.  */
3454   const re_node_set *cur_nodes = &state->nodes;
3455   bitset_empty (accepts);
3456   ndests = 0;
3457
3458   /* For all the nodes belonging to `state',  */
3459   for (i = 0; i < cur_nodes->nelem; ++i)
3460     {
3461       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3462       re_token_type_t type = node->type;
3463       unsigned int constraint = node->constraint;
3464
3465       /* Enumerate all single byte character this node can accept.  */
3466       if (type == CHARACTER)
3467         bitset_set (accepts, node->opr.c);
3468       else if (type == SIMPLE_BRACKET)
3469         {
3470           bitset_merge (accepts, node->opr.sbcset);
3471         }
3472       else if (type == OP_PERIOD)
3473         {
3474 #ifdef RE_ENABLE_I18N
3475           if (dfa->mb_cur_max > 1)
3476             bitset_merge (accepts, dfa->sb_char);
3477           else
3478 #endif
3479             bitset_set_all (accepts);
3480           if (!(dfa->syntax & REG_DOT_NEWLINE))
3481             bitset_clear (accepts, '\n');
3482           if (dfa->syntax & REG_DOT_NOT_NULL)
3483             bitset_clear (accepts, '\0');
3484         }
3485 #ifdef RE_ENABLE_I18N
3486       else if (type == OP_UTF8_PERIOD)
3487         {
3488           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3489           if (!(dfa->syntax & REG_DOT_NEWLINE))
3490             bitset_clear (accepts, '\n');
3491           if (dfa->syntax & REG_DOT_NOT_NULL)
3492             bitset_clear (accepts, '\0');
3493         }
3494 #endif
3495       else
3496         continue;
3497
3498       /* Check the `accepts' and sift the characters which are not
3499          match it the context.  */
3500       if (constraint)
3501         {
3502           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3503             {
3504               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3505               bitset_empty (accepts);
3506               if (accepts_newline)
3507                 bitset_set (accepts, NEWLINE_CHAR);
3508               else
3509                 continue;
3510             }
3511           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3512             {
3513               bitset_empty (accepts);
3514               continue;
3515             }
3516
3517           if (constraint & NEXT_WORD_CONSTRAINT)
3518             {
3519               unsigned int any_set = 0;
3520               if (type == CHARACTER && !node->word_char)
3521                 {
3522                   bitset_empty (accepts);
3523                   continue;
3524                 }
3525 #ifdef RE_ENABLE_I18N
3526               if (dfa->mb_cur_max > 1)
3527                 for (j = 0; j < BITSET_UINTS; ++j)
3528                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3529               else
3530 #endif
3531                 for (j = 0; j < BITSET_UINTS; ++j)
3532                   any_set |= (accepts[j] &= dfa->word_char[j]);
3533               if (!any_set)
3534                 continue;
3535             }
3536           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3537             {
3538               unsigned int any_set = 0;
3539               if (type == CHARACTER && node->word_char)
3540                 {
3541                   bitset_empty (accepts);
3542                   continue;
3543                 }
3544 #ifdef RE_ENABLE_I18N
3545               if (dfa->mb_cur_max > 1)
3546                 for (j = 0; j < BITSET_UINTS; ++j)
3547                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3548               else
3549 #endif
3550                 for (j = 0; j < BITSET_UINTS; ++j)
3551                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3552               if (!any_set)
3553                 continue;
3554             }
3555         }
3556
3557       /* Then divide `accepts' into DFA states, or create a new
3558          state.  Above, we make sure that accepts is not empty.  */
3559       for (j = 0; j < ndests; ++j)
3560         {
3561           bitset intersec; /* Intersection sets, see below.  */
3562           bitset remains;
3563           /* Flags, see below.  */
3564           int has_intersec, not_subset, not_consumed;
3565
3566           /* Optimization, skip if this state doesn't accept the character.  */
3567           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3568             continue;
3569
3570           /* Enumerate the intersection set of this state and `accepts'.  */
3571           has_intersec = 0;
3572           for (k = 0; k < BITSET_UINTS; ++k)
3573             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3574           /* And skip if the intersection set is empty.  */
3575           if (!has_intersec)
3576             continue;
3577
3578           /* Then check if this state is a subset of `accepts'.  */
3579           not_subset = not_consumed = 0;
3580           for (k = 0; k < BITSET_UINTS; ++k)
3581             {
3582               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3583               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3584             }
3585
3586           /* If this state isn't a subset of `accepts', create a
3587              new group state, which has the `remains'. */
3588           if (not_subset)
3589             {
3590               bitset_copy (dests_ch[ndests], remains);
3591               bitset_copy (dests_ch[j], intersec);
3592               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3593               if (BE (err != REG_NOERROR, 0))
3594                 goto error_return;
3595               ++ndests;
3596             }
3597
3598           /* Put the position in the current group. */
3599           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3600           if (BE (result < 0, 0))
3601             goto error_return;
3602
3603           /* If all characters are consumed, go to next node. */
3604           if (!not_consumed)
3605             break;
3606         }
3607       /* Some characters remain, create a new group. */
3608       if (j == ndests)
3609         {
3610           bitset_copy (dests_ch[ndests], accepts);
3611           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3612           if (BE (err != REG_NOERROR, 0))
3613             goto error_return;
3614           ++ndests;
3615           bitset_empty (accepts);
3616         }
3617     }
3618   return ndests;
3619  error_return:
3620   for (j = 0; j < ndests; ++j)
3621     re_node_set_free (dests_node + j);
3622   return -1;
3623 }
3624
3625 #ifdef RE_ENABLE_I18N
3626 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3627    Return the number of the bytes the node accepts.
3628    STR_IDX is the current index of the input string.
3629
3630    This function handles the nodes which can accept one character, or
3631    one collating element like '.', '[a-z]', opposite to the other nodes
3632    can only accept one byte.  */
3633
3634 static int
3635 internal_function
3636 check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
3637                          const re_string_t *input, int str_idx)
3638 {
3639   const re_token_t *node = dfa->nodes + node_idx;
3640   int char_len, elem_len;
3641   int i;
3642
3643   if (BE (node->type == OP_UTF8_PERIOD, 0))
3644     {
3645       unsigned char c = re_string_byte_at (input, str_idx), d;
3646       if (BE (c < 0xc2, 1))
3647         return 0;
3648
3649       if (str_idx + 2 > input->len)
3650         return 0;
3651
3652       d = re_string_byte_at (input, str_idx + 1);
3653       if (c < 0xe0)
3654         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3655       else if (c < 0xf0)
3656         {
3657           char_len = 3;
3658           if (c == 0xe0 && d < 0xa0)
3659             return 0;
3660         }
3661       else if (c < 0xf8)
3662         {
3663           char_len = 4;
3664           if (c == 0xf0 && d < 0x90)
3665             return 0;
3666         }
3667       else if (c < 0xfc)
3668         {
3669           char_len = 5;
3670           if (c == 0xf8 && d < 0x88)
3671             return 0;
3672         }
3673       else if (c < 0xfe)
3674         {
3675           char_len = 6;
3676           if (c == 0xfc && d < 0x84)
3677             return 0;
3678         }
3679       else
3680         return 0;
3681
3682       if (str_idx + char_len > input->len)
3683         return 0;
3684
3685       for (i = 1; i < char_len; ++i)
3686         {
3687           d = re_string_byte_at (input, str_idx + i);
3688           if (d < 0x80 || d > 0xbf)
3689             return 0;
3690         }
3691       return char_len;
3692     }
3693
3694   char_len = re_string_char_size_at (input, str_idx);
3695   if (node->type == OP_PERIOD)
3696     {
3697       if (char_len <= 1)
3698         return 0;
3699       /* FIXME: I don't think this if is needed, as both '\n'
3700          and '\0' are char_len == 1.  */
3701       /* '.' accepts any one character except the following two cases.  */
3702       if ((!(dfa->syntax & REG_DOT_NEWLINE) &&
3703            re_string_byte_at (input, str_idx) == '\n') ||
3704           ((dfa->syntax & REG_DOT_NOT_NULL) &&
3705            re_string_byte_at (input, str_idx) == '\0'))
3706         return 0;
3707       return char_len;
3708     }
3709
3710   elem_len = re_string_elem_size_at (input, str_idx);
3711   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3712     return 0;
3713
3714   if (node->type == COMPLEX_BRACKET)
3715     {
3716       const re_charset_t *cset = node->opr.mbcset;
3717 # ifdef _LIBC
3718       const unsigned char *pin
3719         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3720       int j;
3721       uint32_t nrules;
3722 # endif /* _LIBC */
3723       int match_len = 0;
3724       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3725                     ? re_string_wchar_at (input, str_idx) : 0);
3726
3727       /* match with multibyte character?  */
3728       for (i = 0; i < cset->nmbchars; ++i)
3729         if (wc == cset->mbchars[i])
3730           {
3731             match_len = char_len;
3732             goto check_node_accept_bytes_match;
3733           }
3734       /* match with character_class?  */
3735       for (i = 0; i < cset->nchar_classes; ++i)
3736         {
3737           wctype_t wt = cset->char_classes[i];
3738           if (__iswctype (wc, wt))
3739             {
3740               match_len = char_len;
3741               goto check_node_accept_bytes_match;
3742             }
3743         }
3744
3745 # ifdef _LIBC
3746       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3747       if (nrules != 0)
3748         {
3749           unsigned int in_collseq = 0;
3750           const int32_t *table, *indirect;
3751           const unsigned char *weights, *extra;
3752           const char *collseqwc;
3753           int32_t idx;
3754           /* This #include defines a local function!  */
3755 #  include <locale/weight.h>
3756
3757           /* match with collating_symbol?  */
3758           if (cset->ncoll_syms)
3759             extra = (const unsigned char *)
3760               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3761           for (i = 0; i < cset->ncoll_syms; ++i)
3762             {
3763               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3764               /* Compare the length of input collating element and
3765                  the length of current collating element.  */
3766               if (*coll_sym != elem_len)
3767                 continue;
3768               /* Compare each bytes.  */
3769               for (j = 0; j < *coll_sym; j++)
3770                 if (pin[j] != coll_sym[1 + j])
3771                   break;
3772               if (j == *coll_sym)
3773                 {
3774                   /* Match if every bytes is equal.  */
3775                   match_len = j;
3776                   goto check_node_accept_bytes_match;
3777                 }
3778             }
3779
3780           if (cset->nranges)
3781             {
3782               if (elem_len <= char_len)
3783                 {
3784                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3785                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3786                 }
3787               else
3788                 in_collseq = find_collation_sequence_value (pin, elem_len);
3789             }
3790           /* match with range expression?  */
3791           for (i = 0; i < cset->nranges; ++i)
3792             if (cset->range_starts[i] <= in_collseq
3793                 && in_collseq <= cset->range_ends[i])
3794               {
3795                 match_len = elem_len;
3796                 goto check_node_accept_bytes_match;
3797               }
3798
3799           /* match with equivalence_class?  */
3800           if (cset->nequiv_classes)
3801             {
3802               const unsigned char *cp = pin;
3803               table = (const int32_t *)
3804                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3805               weights = (const unsigned char *)
3806                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3807               extra = (const unsigned char *)
3808                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3809               indirect = (const int32_t *)
3810                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3811               idx = findidx (&cp);
3812               if (idx > 0)
3813                 for (i = 0; i < cset->nequiv_classes; ++i)
3814                   {
3815                     int32_t equiv_class_idx = cset->equiv_classes[i];
3816                     size_t weight_len = weights[idx];
3817                     if (weight_len == weights[equiv_class_idx])
3818                       {
3819                         int cnt = 0;
3820                         while (cnt <= weight_len
3821                                && (weights[equiv_class_idx + 1 + cnt]
3822                                    == weights[idx + 1 + cnt]))
3823                           ++cnt;
3824                         if (cnt > weight_len)
3825                           {
3826                             match_len = elem_len;
3827                             goto check_node_accept_bytes_match;
3828                           }
3829                       }
3830                   }
3831             }
3832         }
3833       else
3834 # endif /* _LIBC */
3835         {
3836           /* match with range expression?  */
3837 #if __GNUC__ >= 2
3838           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3839 #else
3840           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3841           cmp_buf[2] = wc;
3842 #endif
3843           for (i = 0; i < cset->nranges; ++i)
3844             {
3845               cmp_buf[0] = cset->range_starts[i];
3846               cmp_buf[4] = cset->range_ends[i];
3847               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3848                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3849                 {
3850                   match_len = char_len;
3851                   goto check_node_accept_bytes_match;
3852                 }
3853             }
3854         }
3855     check_node_accept_bytes_match:
3856       if (!cset->non_match)
3857         return match_len;
3858       else
3859         {
3860           if (match_len > 0)
3861             return 0;
3862           else
3863             return (elem_len > char_len) ? elem_len : char_len;
3864         }
3865     }
3866   return 0;
3867 }
3868
3869 # ifdef _LIBC
3870 static unsigned int
3871 find_collation_sequence_value (mbs, mbs_len)
3872     const unsigned char *mbs;
3873     size_t mbs_len;
3874 {
3875   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3876   if (nrules == 0)
3877     {
3878       if (mbs_len == 1)
3879         {
3880           /* No valid character.  Match it as a single byte character.  */
3881           const unsigned char *collseq = (const unsigned char *)
3882             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3883           return collseq[mbs[0]];
3884         }
3885       return UINT_MAX;
3886     }
3887   else
3888     {
3889       int32_t idx;
3890       const unsigned char *extra = (const unsigned char *)
3891         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3892       int32_t extrasize = (const unsigned char *)
3893         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3894
3895       for (idx = 0; idx < extrasize;)
3896         {
3897           int mbs_cnt, found = 0;
3898           int32_t elem_mbs_len;
3899           /* Skip the name of collating element name.  */
3900           idx = idx + extra[idx] + 1;
3901           elem_mbs_len = extra[idx++];
3902           if (mbs_len == elem_mbs_len)
3903             {
3904               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3905                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3906                   break;
3907               if (mbs_cnt == elem_mbs_len)
3908                 /* Found the entry.  */
3909                 found = 1;
3910             }
3911           /* Skip the byte sequence of the collating element.  */
3912           idx += elem_mbs_len;
3913           /* Adjust for the alignment.  */
3914           idx = (idx + 3) & ~3;
3915           /* Skip the collation sequence value.  */
3916           idx += sizeof (uint32_t);
3917           /* Skip the wide char sequence of the collating element.  */
3918           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3919           /* If we found the entry, return the sequence value.  */
3920           if (found)
3921             return *(uint32_t *) (extra + idx);
3922           /* Skip the collation sequence value.  */
3923           idx += sizeof (uint32_t);
3924         }
3925       return UINT_MAX;
3926     }
3927 }
3928 # endif /* _LIBC */
3929 #endif /* RE_ENABLE_I18N */
3930
3931 /* Check whether the node accepts the byte which is IDX-th
3932    byte of the INPUT.  */
3933
3934 static int
3935 internal_function
3936 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3937                    int idx)
3938 {
3939   unsigned char ch;
3940   ch = re_string_byte_at (&mctx->input, idx);
3941   switch (node->type)
3942     {
3943     case CHARACTER:
3944       if (node->opr.c != ch)
3945         return 0;
3946       break;
3947
3948     case SIMPLE_BRACKET:
3949       if (!bitset_contain (node->opr.sbcset, ch))
3950         return 0;
3951       break;
3952
3953 #ifdef RE_ENABLE_I18N
3954     case OP_UTF8_PERIOD:
3955       if (ch >= 0x80)
3956         return 0;
3957       /* FALLTHROUGH */
3958 #endif
3959     case OP_PERIOD:
3960       if ((ch == '\n' && !(mctx->dfa->syntax & REG_DOT_NEWLINE))
3961           || (ch == '\0' && (mctx->dfa->syntax & REG_DOT_NOT_NULL)))
3962         return 0;
3963       break;
3964
3965     default:
3966       return 0;
3967     }
3968
3969   if (node->constraint)
3970     {
3971       /* The node has constraints.  Check whether the current context
3972          satisfies the constraints.  */
3973       unsigned int context = re_string_context_at (&mctx->input, idx,
3974                                                    mctx->eflags);
3975       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3976         return 0;
3977     }
3978
3979   return 1;
3980 }
3981
3982 /* Extend the buffers, if the buffers have run out.  */
3983
3984 static reg_errcode_t
3985 internal_function
3986 extend_buffers (re_match_context_t *mctx)
3987 {
3988   reg_errcode_t ret;
3989   re_string_t *pstr = &mctx->input;
3990
3991   /* Double the lengthes of the buffers.  */
3992   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3993   if (BE (ret != REG_NOERROR, 0))
3994     return ret;
3995
3996   if (mctx->state_log != NULL)
3997     {
3998       /* And double the length of state_log.  */
3999       /* XXX We have no indication of the size of this buffer.  If this
4000          allocation fail we have no indication that the state_log array
4001          does not have the right size.  */
4002       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4003                                               pstr->bufs_len + 1);
4004       if (BE (new_array == NULL, 0))
4005         return REG_ESPACE;
4006       mctx->state_log = new_array;
4007     }
4008
4009   /* Then reconstruct the buffers.  */
4010   if (pstr->icase)
4011     {
4012 #ifdef RE_ENABLE_I18N
4013       if (pstr->mb_cur_max > 1)
4014         {
4015           ret = build_wcs_upper_buffer (pstr);
4016           if (BE (ret != REG_NOERROR, 0))
4017             return ret;
4018         }
4019       else
4020 #endif /* RE_ENABLE_I18N  */
4021         build_upper_buffer (pstr);
4022     }
4023   else
4024     {
4025 #ifdef RE_ENABLE_I18N
4026       if (pstr->mb_cur_max > 1)
4027         build_wcs_buffer (pstr);
4028       else
4029 #endif /* RE_ENABLE_I18N  */
4030         {
4031           if (pstr->trans != NULL)
4032             re_string_translate_buffer (pstr);
4033         }
4034     }
4035   return REG_NOERROR;
4036 }
4037
4038 \f
4039 /* Functions for matching context.  */
4040
4041 /* Initialize MCTX.  */
4042
4043 static reg_errcode_t
4044 internal_function
4045 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4046 {
4047   mctx->eflags = eflags;
4048   mctx->match_last = -1;
4049   if (n > 0)
4050     {
4051       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4052       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4053       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4054         return REG_ESPACE;
4055     }
4056   /* Already zero-ed by the caller.
4057      else
4058        mctx->bkref_ents = NULL;
4059      mctx->nbkref_ents = 0;
4060      mctx->nsub_tops = 0;  */
4061   mctx->abkref_ents = n;
4062   mctx->max_mb_elem_len = 1;
4063   mctx->asub_tops = n;
4064   return REG_NOERROR;
4065 }
4066
4067 /* Clean the entries which depend on the current input in MCTX.
4068    This function must be invoked when the matcher changes the start index
4069    of the input, or changes the input string.  */
4070
4071 static void
4072 internal_function
4073 match_ctx_clean (re_match_context_t *mctx)
4074 {
4075   int st_idx;
4076   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4077     {
4078       int sl_idx;
4079       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4080       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4081         {
4082           re_sub_match_last_t *last = top->lasts[sl_idx];
4083           re_free (last->path.array);
4084           re_free (last);
4085         }
4086       re_free (top->lasts);
4087       if (top->path)
4088         {
4089           re_free (top->path->array);
4090           re_free (top->path);
4091         }
4092       free (top);
4093     }
4094
4095   mctx->nsub_tops = 0;
4096   mctx->nbkref_ents = 0;
4097 }
4098
4099 /* Free all the memory associated with MCTX.  */
4100
4101 static void
4102 internal_function
4103 match_ctx_free (re_match_context_t *mctx)
4104 {
4105   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4106   match_ctx_clean (mctx);
4107   re_free (mctx->sub_tops);
4108   re_free (mctx->bkref_ents);
4109 }
4110
4111 /* Add a new backreference entry to MCTX.
4112    Note that we assume that caller never call this function with duplicate
4113    entry, and call with STR_IDX which isn't smaller than any existing entry.
4114 */
4115
4116 static reg_errcode_t
4117 internal_function
4118 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx,
4119                      int from, int to)
4120 {
4121   if (mctx->nbkref_ents >= mctx->abkref_ents)
4122     {
4123       struct re_backref_cache_entry* new_entry;
4124       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4125                               mctx->abkref_ents * 2);
4126       if (BE (new_entry == NULL, 0))
4127         {
4128           re_free (mctx->bkref_ents);
4129           return REG_ESPACE;
4130         }
4131       mctx->bkref_ents = new_entry;
4132       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4133               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4134       mctx->abkref_ents *= 2;
4135     }
4136   if (mctx->nbkref_ents > 0
4137       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4138     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4139
4140   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4141   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4142   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4143   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4144
4145   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4146      If bit N is clear, means that this entry won't epsilon-transition to
4147      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4148      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4149      such node.
4150
4151      A backreference does not epsilon-transition unless it is empty, so set
4152      to all zeros if FROM != TO.  */
4153   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4154     = (from == to ? ~0 : 0);
4155
4156   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4157   if (mctx->max_mb_elem_len < to - from)
4158     mctx->max_mb_elem_len = to - from;
4159   return REG_NOERROR;
4160 }
4161
4162 /* Search for the first entry which has the same str_idx, or -1 if none is
4163    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4164
4165 static int
4166 internal_function
4167 search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
4168 {
4169   int left, right, mid, last;
4170   last = right = mctx->nbkref_ents;
4171   for (left = 0; left < right;)
4172     {
4173       mid = (left + right) / 2;
4174       if (mctx->bkref_ents[mid].str_idx < str_idx)
4175         left = mid + 1;
4176       else
4177         right = mid;
4178     }
4179   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4180     return left;
4181   else
4182     return -1;
4183 }
4184
4185 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4186    at STR_IDX.  */
4187
4188 static reg_errcode_t
4189 internal_function
4190 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4191 {
4192 #ifdef DEBUG
4193   assert (mctx->sub_tops != NULL);
4194   assert (mctx->asub_tops > 0);
4195 #endif
4196   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4197     {
4198       int new_asub_tops = mctx->asub_tops * 2;
4199       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4200                                                    re_sub_match_top_t *,
4201                                                    new_asub_tops);
4202       if (BE (new_array == NULL, 0))
4203         return REG_ESPACE;
4204       mctx->sub_tops = new_array;
4205       mctx->asub_tops = new_asub_tops;
4206     }
4207   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4208   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4209     return REG_ESPACE;
4210   mctx->sub_tops[mctx->nsub_tops]->node = node;
4211   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4212   return REG_NOERROR;
4213 }
4214
4215 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4216    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4217
4218 static re_sub_match_last_t *
4219 internal_function
4220 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4221 {
4222   re_sub_match_last_t *new_entry;
4223   if (BE (subtop->nlasts == subtop->alasts, 0))
4224     {
4225       int new_alasts = 2 * subtop->alasts + 1;
4226       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4227                                                     re_sub_match_last_t *,
4228                                                     new_alasts);
4229       if (BE (new_array == NULL, 0))
4230         return NULL;
4231       subtop->lasts = new_array;
4232       subtop->alasts = new_alasts;
4233     }
4234   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4235   if (BE (new_entry != NULL, 1))
4236     {
4237       subtop->lasts[subtop->nlasts] = new_entry;
4238       new_entry->node = node;
4239       new_entry->str_idx = str_idx;
4240       ++subtop->nlasts;
4241     }
4242   return new_entry;
4243 }
4244
4245 static void
4246 internal_function
4247 sift_ctx_init (re_sift_context_t *sctx,
4248                re_dfastate_t **sifted_sts,
4249                re_dfastate_t **limited_sts,
4250                int last_node, int last_str_idx)
4251 {
4252   sctx->sifted_states = sifted_sts;
4253   sctx->limited_states = limited_sts;
4254   sctx->last_node = last_node;
4255   sctx->last_str_idx = last_str_idx;
4256   re_node_set_init_empty (&sctx->limits);
4257 }