lib/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      int n) internal_function;
  22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  23 static void match_ctx_free (re_match_context_t *cache) internal_function;
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  25                                           int str_idx, int from, int to)
  26      internal_function;
  27 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  28      internal_function;
  29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  30                                            int str_idx) internal_function;
  31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  32                                                    int node, int str_idx)
  33      internal_function;
  34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  35                            re_dfastate_t **limited_sts, int last_node,
  36                            int last_str_idx)
  37      internal_function;
  38 static reg_errcode_t re_search_internal (const regex_t *preg,
  39                                          const char *string, int length,
  40                                          int start, int range, int stop,
  41                                          size_t nmatch, regmatch_t pmatch[],
  42                                          int eflags) internal_function;
  43 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  44                              const char *string1, int length1,
  45                              const char *string2, int length2,
  46                              int start, int range, struct re_registers *regs,
  47                              int stop, int ret_len) internal_function;
  48 static int re_search_stub (struct re_pattern_buffer *bufp,
  49                            const char *string, int length, int start,
  50                            int range, int stop, struct re_registers *regs,
  51                            int ret_len) internal_function;
  52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  53                               int nregs, int regs_allocated) internal_function;
  54 static inline re_dfastate_t *acquire_init_state_context
  55      (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
  56      __attribute ((always_inline)) internal_function;
  57 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  58      internal_function;
  59 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  60                            int *p_match_first)
  61      internal_function;
  62 static int check_halt_node_context (const re_dfa_t *dfa, int node,
  63                                     unsigned int context) internal_function;
  64 static int check_halt_state_context (const re_match_context_t *mctx,
  65                                      const re_dfastate_t *state, int idx)
  66      internal_function;
  67 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  68                          regmatch_t *prev_idx_match, int cur_node,
  69                          int cur_idx, int nmatch) internal_function;
  70 static int proceed_next_node (const re_match_context_t *mctx,
  71                               int nregs, regmatch_t *regs,
  72                               int *pidx, int node, re_node_set *eps_via_nodes,
  73                               struct re_fail_stack_t *fs) internal_function;
  74 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  75                                       int str_idx, int dest_node, int nregs,
  76                                       regmatch_t *regs,
  77                                       re_node_set *eps_via_nodes) internal_function;
  78 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
  79                            regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
  80 static reg_errcode_t set_regs (const regex_t *preg,
  81                                const re_match_context_t *mctx,
  82                                size_t nmatch, regmatch_t *pmatch,
  83                                int fl_backtrack) internal_function;
  84 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  85
  86 #ifdef RE_ENABLE_I18N
  87 static int sift_states_iter_mb (const re_match_context_t *mctx,
  88                                 re_sift_context_t *sctx,
  89                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  90 #endif /* RE_ENABLE_I18N */
  91 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  92                                            re_sift_context_t *sctx) internal_function;
  93 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  94                                           re_sift_context_t *sctx, int str_idx,
  95                                           re_node_set *cur_dest) internal_function;
  96 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  97                                               re_sift_context_t *sctx,
  98                                               int str_idx,
  99                                               re_node_set *dest_nodes) internal_function;
 100 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
 101                                             re_node_set *dest_nodes,
 102                                             const re_node_set *candidates) internal_function;
 103 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
 104                                             re_node_set *dest_nodes,
 105                                             const re_node_set *and_nodes) internal_function;
 106 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
 107                              int dst_node, int dst_idx, int src_node,
 108                              int src_idx) internal_function;
 109 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
 110                                         int boundaries, int subexp_idx,
 111                                         int from_node, int bkref_idx) internal_function;
 112 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
 113                                       int limit, int subexp_idx,
 114                                       int node, int str_idx,
 115                                       int bkref_idx) internal_function;
 116 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 117                                           re_node_set *dest_nodes,
 118                                           const re_node_set *candidates,
 119                                           re_node_set *limits,
 120                                           struct re_backref_cache_entry *bkref_ents,
 121                                           int str_idx) internal_function;
 122 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 123                                         re_sift_context_t *sctx,
 124                                         int str_idx, const re_node_set *candidates) internal_function;
 125 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
 126                                                 int next_state_log_idx) internal_function;
 127 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 128                                         re_dfastate_t **src, int num) internal_function;
 129 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 130                                          re_match_context_t *mctx) internal_function;
 131 static re_dfastate_t *transit_state (reg_errcode_t *err,
 132                                      re_match_context_t *mctx,
 133                                      re_dfastate_t *state) internal_function;
 134 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 135                                             re_match_context_t *mctx,
 136                                             re_dfastate_t *next_state) internal_function;
 137 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 138                                                 re_node_set *cur_nodes,
 139                                                 int str_idx) internal_function;
 140 #if 0
 141 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 142                                         re_match_context_t *mctx,
 143                                         re_dfastate_t *pstate) internal_function;
 144 #endif
 145 #ifdef RE_ENABLE_I18N
 146 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 147                                        re_dfastate_t *pstate) internal_function;
 148 #endif /* RE_ENABLE_I18N */
 149 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 150                                           const re_node_set *nodes) internal_function;
 151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 152                                  int bkref_node, int bkref_str_idx) internal_function;
 153 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 154                                      const re_sub_match_top_t *sub_top,
 155                                      re_sub_match_last_t *sub_last,
 156                                      int bkref_node, int bkref_str) internal_function;
 157 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 158                              int subexp_idx, int type) internal_function;
 159 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 160                                     state_array_t *path, int top_node,
 161                                     int top_str, int last_node, int last_str,
 162                                     int type) internal_function;
 163 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 164                                                    int str_idx,
 165                                                    re_node_set *cur_nodes,
 166                                                    re_node_set *next_nodes) internal_function;
 167 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 168                                                re_node_set *cur_nodes,
 169                                                int ex_subexp, int type) internal_function;
 170 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 171                                                    re_node_set *dst_nodes,
 172                                                    int target, int ex_subexp,
 173                                                    int type) internal_function;
 174 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 175                                          re_node_set *cur_nodes, int cur_str,
 176                                          int subexp_num, int type) internal_function;
 177 static int build_trtable (re_dfa_t *dfa,
 178                           re_dfastate_t *state) internal_function;
 179 #ifdef RE_ENABLE_I18N
 180 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 181                                     const re_string_t *input, int idx) internal_function;
 182 # ifdef _LIBC
 183 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 184                                                    size_t name_len) internal_function;
 185 # endif /* _LIBC */
 186 #endif /* RE_ENABLE_I18N */
 187 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 188                                        const re_dfastate_t *state,
 189                                        re_node_set *states_node,
 190                                        bitset *states_ch) internal_function;
 191 static int check_node_accept (const re_match_context_t *mctx,
 192                               const re_token_t *node, int idx) internal_function;
 193 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 194 \f
 195 /* Entry point for POSIX code.  */
 196
 197 /* regexec searches for a given pattern, specified by PREG, in the
 198    string STRING.
 199
 200    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 201    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 202    least NMATCH elements, and we set them to the offsets of the
 203    corresponding matched substrings.
 204
 205    EFLAGS specifies `execution flags' which affect matching: if
 206    REG_NOTBOL is set, then ^ does not match at the beginning of the
 207    string; if REG_NOTEOL is set, then $ does not match at the end.
 208
 209    We return 0 if we find a match and REG_NOMATCH if not.  */
 210
 211 int
 212 regexec (const regex_t *__restrict preg, const char *__restrict string,
 213          size_t nmatch, regmatch_t pmatch[], int eflags)
 214 {
 215   reg_errcode_t err;
 216   int start, length;
 217 #ifdef _LIBC
 218   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 219 #endif
 220
 221   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 222     return REG_BADPAT;
 223
 224   if (eflags & REG_STARTEND)
 225     {
 226       start = pmatch[0].rm_so;
 227       length = pmatch[0].rm_eo;
 228     }
 229   else
 230     {
 231       start = 0;
 232       length = strlen (string);
 233     }
 234
 235   __libc_lock_lock (dfa->lock);
 236   if (preg->no_sub)
 237     err = re_search_internal (preg, string, length, start, length - start,
 238                               length, 0, NULL, eflags);
 239   else
 240     err = re_search_internal (preg, string, length, start, length - start,
 241                               length, nmatch, pmatch, eflags);
 242   __libc_lock_unlock (dfa->lock);
 243   return err != REG_NOERROR;
 244 }
 245
 246 #ifdef _LIBC
 247 # include <shlib-compat.h>
 248 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 249
 250 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 251 __typeof__ (__regexec) __compat_regexec;
 252
 253 int
 254 attribute_compat_text_section
 255 __compat_regexec (const regex_t *__restrict preg,
 256                   const char *__restrict string, size_t nmatch,
 257                   regmatch_t pmatch[], int eflags)
 258 {
 259   return regexec (preg, string, nmatch, pmatch,
 260                   eflags & (REG_NOTBOL | REG_NOTEOL));
 261 }
 262 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 263 # endif
 264 #endif
 265
 266 /* Entry points for GNU code.  */
 267
 268 /* re_match, re_search, re_match_2, re_search_2
 269
 270    The former two functions operate on STRING with length LENGTH,
 271    while the later two operate on concatenation of STRING1 and STRING2
 272    with lengths LENGTH1 and LENGTH2, respectively.
 273
 274    re_match() matches the compiled pattern in BUFP against the string,
 275    starting at index START.
 276
 277    re_search() first tries matching at index START, then it tries to match
 278    starting from index START + 1, and so on.  The last start position tried
 279    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 280    way as re_match().)
 281
 282    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 283    the first STOP characters of the concatenation of the strings should be
 284    concerned.
 285
 286    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 287    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 288    computed relative to the concatenation, not relative to the individual
 289    strings.)
 290
 291    On success, re_match* functions return the length of the match, re_search*
 292    return the position of the start of the match.  Return value -1 means no
 293    match was found and -2 indicates an internal error.  */
 294
 295 int
 296 re_match (struct re_pattern_buffer *bufp, const char *string,
 297           int length, int start, struct re_registers *regs)
 298 {
 299   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 300 }
 301 #ifdef _LIBC
 302 weak_alias (__re_match, re_match)
 303 #endif
 304
 305 int
 306 re_search (struct re_pattern_buffer *bufp, const char *string,
 307            int length, int start, int range, struct re_registers *regs)
 308 {
 309   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 310 }
 311 #ifdef _LIBC
 312 weak_alias (__re_search, re_search)
 313 #endif
 314
 315 int
 316 re_match_2 (struct re_pattern_buffer *bufp,
 317             const char *string1, int length1,
 318             const char *string2, int length2,
 319             int start, struct re_registers *regs, int stop)
 320 {
 321   return re_search_2_stub (bufp, string1, length1, string2, length2,
 322                            start, 0, regs, stop, 1);
 323 }
 324 #ifdef _LIBC
 325 weak_alias (__re_match_2, re_match_2)
 326 #endif
 327
 328 int
 329 re_search_2 (struct re_pattern_buffer *bufp,
 330              const char *string1, int length1,
 331              const char *string2, int length2,
 332              int start, int range, struct re_registers *regs, int stop)
 333 {
 334   return re_search_2_stub (bufp, string1, length1, string2, length2,
 335                            start, range, regs, stop, 0);
 336 }
 337 #ifdef _LIBC
 338 weak_alias (__re_search_2, re_search_2)
 339 #endif
 340
 341 static int
 342 internal_function
 343 re_search_2_stub (struct re_pattern_buffer *bufp,
 344                   const char *string1, int length1,
 345                   const char *string2, int length2,
 346                   int start, int range, struct re_registers *regs, int stop,
 347                   int ret_len)
 348 {
 349   const char *str;
 350   int rval;
 351   int len = length1 + length2;
 352   int free_str = 0;
 353
 354   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 355     return -2;
 356
 357   /* Concatenate the strings.  */
 358   if (length2 > 0)
 359     if (length1 > 0)
 360       {
 361         char *s = re_malloc (char, len);
 362
 363         if (BE (s == NULL, 0))
 364           return -2;
 365         memcpy (s, string1, length1);
 366         memcpy (s + length1, string2, length2);
 367         str = s;
 368         free_str = 1;
 369       }
 370     else
 371       str = string2;
 372   else
 373     str = string1;
 374
 375   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 376                          ret_len);
 377   if (free_str)
 378     re_free ((char *) str);
 379   return rval;
 380 }
 381
 382 /* The parameters have the same meaning as those of re_search.
 383    Additional parameters:
 384    If RET_LEN is nonzero the length of the match is returned (re_match style);
 385    otherwise the position of the match is returned.  */
 386
 387 static int
 388 internal_function
 389 re_search_stub (struct re_pattern_buffer *bufp,
 390                 const char *string, int length,
 391                 int start, int range, int stop, struct re_registers *regs,
 392                 int ret_len)
 393 {
 394   reg_errcode_t result;
 395   regmatch_t *pmatch;
 396   int nregs, rval;
 397   int eflags = 0;
 398 #ifdef _LIBC
 399   re_dfa_t *dfa = (re_dfa_t *)bufp->buffer;
 400 #endif
 401
 402   /* Check for out-of-range.  */
 403   if (BE (start < 0 || start > length, 0))
 404     return -1;
 405   if (BE (start + range > length, 0))
 406     range = length - start;
 407   else if (BE (start + range < 0, 0))
 408     range = -start;
 409
 410   __libc_lock_lock (dfa->lock);
 411
 412   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 413   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 414
 415   /* Compile fastmap if we haven't yet.  */
 416   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 417     re_compile_fastmap (bufp);
 418
 419   if (BE (bufp->no_sub, 0))
 420     regs = NULL;
 421
 422   /* We need at least 1 register.  */
 423   if (regs == NULL)
 424     nregs = 1;
 425   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 426                regs->num_regs < bufp->re_nsub + 1, 0))
 427     {
 428       nregs = regs->num_regs;
 429       if (BE (nregs < 1, 0))
 430         {
 431           /* Nothing can be copied to regs.  */
 432           regs = NULL;
 433           nregs = 1;
 434         }
 435     }
 436   else
 437     nregs = bufp->re_nsub + 1;
 438   pmatch = re_malloc (regmatch_t, nregs);
 439   if (BE (pmatch == NULL, 0))
 440     {
 441       rval = -2;
 442       goto out;
 443     }
 444
 445   result = re_search_internal (bufp, string, length, start, range, stop,
 446                                nregs, pmatch, eflags);
 447
 448   rval = 0;
 449
 450   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 451   if (result != REG_NOERROR)
 452     rval = -1;
 453   else if (regs != NULL)
 454     {
 455       /* If caller wants register contents data back, copy them.  */
 456       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 457                                            bufp->regs_allocated);
 458       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 459         rval = -2;
 460     }
 461
 462   if (BE (rval == 0, 1))
 463     {
 464       if (ret_len)
 465         {
 466           assert (pmatch[0].rm_so == start);
 467           rval = pmatch[0].rm_eo - start;
 468         }
 469       else
 470         rval = pmatch[0].rm_so;
 471     }
 472   re_free (pmatch);
 473  out:
 474   __libc_lock_unlock (dfa->lock);
 475   return rval;
 476 }
 477
 478 static unsigned
 479 internal_function
 480 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs,
 481               int regs_allocated)
 482 {
 483   int rval = REGS_REALLOCATE;
 484   int i;
 485   int need_regs = nregs + 1;
 486   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 487      uses.  */
 488
 489   /* Have the register data arrays been allocated?  */
 490   if (regs_allocated == REGS_UNALLOCATED)
 491     { /* No.  So allocate them with malloc.  */
 492       regs->start = re_malloc (regoff_t, need_regs);
 493       regs->end = re_malloc (regoff_t, need_regs);
 494       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 495         return REGS_UNALLOCATED;
 496       regs->num_regs = need_regs;
 497     }
 498   else if (regs_allocated == REGS_REALLOCATE)
 499     { /* Yes.  If we need more elements than were already
 500          allocated, reallocate them.  If we need fewer, just
 501          leave it alone.  */
 502       if (BE (need_regs > regs->num_regs, 0))
 503         {
 504           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 505           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 506           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 507             return REGS_UNALLOCATED;
 508           regs->start = new_start;
 509           regs->end = new_end;
 510           regs->num_regs = need_regs;
 511         }
 512     }
 513   else
 514     {
 515       assert (regs_allocated == REGS_FIXED);
 516       /* This function may not be called with REGS_FIXED and nregs too big.  */
 517       assert (regs->num_regs >= nregs);
 518       rval = REGS_FIXED;
 519     }
 520
 521   /* Copy the regs.  */
 522   for (i = 0; i < nregs; ++i)
 523     {
 524       regs->start[i] = pmatch[i].rm_so;
 525       regs->end[i] = pmatch[i].rm_eo;
 526     }
 527   for ( ; i < regs->num_regs; ++i)
 528     regs->start[i] = regs->end[i] = -1;
 529
 530   return rval;
 531 }
 532
 533 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 534    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 535    this memory for recording register information.  STARTS and ENDS
 536    must be allocated using the malloc library routine, and must each
 537    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 538
 539    If NUM_REGS == 0, then subsequent matches should allocate their own
 540    register data.
 541
 542    Unless this function is called, the first search or match using
 543    PATTERN_BUFFER will allocate its own register data, without
 544    freeing the old data.  */
 545
 546 void
 547 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
 548                   unsigned int num_regs, regoff_t *starts, regoff_t *ends)
 549 {
 550   if (num_regs)
 551     {
 552       bufp->regs_allocated = REGS_REALLOCATE;
 553       regs->num_regs = num_regs;
 554       regs->start = starts;
 555       regs->end = ends;
 556     }
 557   else
 558     {
 559       bufp->regs_allocated = REGS_UNALLOCATED;
 560       regs->num_regs = 0;
 561       regs->start = regs->end = (regoff_t *) 0;
 562     }
 563 }
 564 #ifdef _LIBC
 565 weak_alias (__re_set_registers, re_set_registers)
 566 #endif
 567 \f
 568 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 569    them unless specifically requested.  */
 570
 571 #if defined _REGEX_RE_COMP || defined _LIBC
 572 int
 573 # ifdef _LIBC
 574 weak_function
 575 # endif
 576 re_exec (s)
 577      const char *s;
 578 {
 579   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 580 }
 581 #endif /* _REGEX_RE_COMP */
 582 \f
 583 /* Internal entry point.  */
 584
 585 /* Searches for a compiled pattern PREG in the string STRING, whose
 586    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 587    mingings with regexec.  START, and RANGE have the same meanings
 588    with re_search.
 589    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 590    otherwise return the error code.
 591    Note: We assume front end functions already check ranges.
 592    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 593
 594 static reg_errcode_t
 595 internal_function
 596 re_search_internal (const regex_t *preg,
 597                     const char *string, int length,
 598                     int start, int range, int stop,
 599                     size_t nmatch, regmatch_t pmatch[],
 600                     int eflags)
 601 {
 602   reg_errcode_t err;
 603   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 604   int left_lim, right_lim, incr;
 605   int fl_longest_match, match_first, match_kind, match_last = -1;
 606   int extra_nmatch;
 607   int sb, ch;
 608 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 609   re_match_context_t mctx = { .dfa = dfa };
 610 #else
 611   re_match_context_t mctx;
 612 #endif
 613   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 614                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 615   unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
 616
 617 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 618   memset (&mctx, '\0', sizeof (re_match_context_t));
 619   mctx.dfa = dfa;
 620 #endif
 621
 622   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 623   nmatch -= extra_nmatch;
 624
 625   /* Check if the DFA haven't been compiled.  */
 626   if (BE (preg->used == 0 || dfa->init_state == NULL
 627           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 628           || dfa->init_state_begbuf == NULL, 0))
 629     return REG_NOMATCH;
 630
 631 #ifdef DEBUG
 632   /* We assume front-end functions already check them.  */
 633   assert (start + range >= 0 && start + range <= length);
 634 #endif
 635
 636   /* If initial states with non-begbuf contexts have no elements,
 637      the regex must be anchored.  If preg->newline_anchor is set,
 638      we'll never use init_state_nl, so do not check it.  */
 639   if (dfa->init_state->nodes.nelem == 0
 640       && dfa->init_state_word->nodes.nelem == 0
 641       && (dfa->init_state_nl->nodes.nelem == 0
 642           || !preg->newline_anchor))
 643     {
 644       if (start != 0 && start + range != 0)
 645         return REG_NOMATCH;
 646       start = range = 0;
 647     }
 648
 649   /* We must check the longest matching, if nmatch > 0.  */
 650   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 651
 652   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 653                             preg->translate, preg->syntax & RE_ICASE, dfa);
 654   if (BE (err != REG_NOERROR, 0))
 655     goto free_return;
 656   mctx.input.stop = stop;
 657   mctx.input.raw_stop = stop;
 658   mctx.input.newline_anchor = preg->newline_anchor;
 659
 660   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 661   if (BE (err != REG_NOERROR, 0))
 662     goto free_return;
 663
 664   /* We will log all the DFA states through which the dfa pass,
 665      if nmatch > 1, or this dfa has "multibyte node", which is a
 666      back-reference or a node which can accept multibyte character or
 667      multi character collating element.  */
 668   if (nmatch > 1 || dfa->has_mb_node)
 669     {
 670       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 671       if (BE (mctx.state_log == NULL, 0))
 672         {
 673           err = REG_ESPACE;
 674           goto free_return;
 675         }
 676     }
 677   else
 678     mctx.state_log = NULL;
 679
 680   match_first = start;
 681   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 682                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 683
 684   /* Check incrementally whether of not the input string match.  */
 685   incr = (range < 0) ? -1 : 1;
 686   left_lim = (range < 0) ? start + range : start;
 687   right_lim = (range < 0) ? start : start + range;
 688   sb = dfa->mb_cur_max == 1;
 689   match_kind =
 690     (fastmap
 691      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 692         | (range >= 0 ? 2 : 0)
 693         | (t != NULL ? 1 : 0))
 694      : 8);
 695
 696   for (;; match_first += incr)
 697     {
 698       err = REG_NOMATCH;
 699       if (match_first < left_lim || right_lim < match_first)
 700         goto free_return;
 701
 702       /* Advance as rapidly as possible through the string, until we
 703          find a plausible place to start matching.  This may be done
 704          with varying efficiency, so there are various possibilities:
 705          only the most common of them are specialized, in order to
 706          save on code size.  We use a switch statement for speed.  */
 707       switch (match_kind)
 708         {
 709         case 8:
 710           /* No fastmap.  */
 711           break;
 712
 713         case 7:
 714           /* Fastmap with single-byte translation, match forward.  */
 715           while (BE (match_first < right_lim, 1)
 716                  && !fastmap[t[(unsigned char) string[match_first]]])
 717             ++match_first;
 718           goto forward_match_found_start_or_reached_end;
 719
 720         case 6:
 721           /* Fastmap without translation, match forward.  */
 722           while (BE (match_first < right_lim, 1)
 723                  && !fastmap[(unsigned char) string[match_first]])
 724             ++match_first;
 725
 726         forward_match_found_start_or_reached_end:
 727           if (BE (match_first == right_lim, 0))
 728             {
 729               ch = match_first >= length
 730                        ? 0 : (unsigned char) string[match_first];
 731               if (!fastmap[t ? t[ch] : ch])
 732                 goto free_return;
 733             }
 734           break;
 735
 736         case 4:
 737         case 5:
 738           /* Fastmap without multi-byte translation, match backwards.  */
 739           while (match_first >= left_lim)
 740             {
 741               ch = match_first >= length
 742                        ? 0 : (unsigned char) string[match_first];
 743               if (fastmap[t ? t[ch] : ch])
 744                 break;
 745               --match_first;
 746             }
 747           if (match_first < left_lim)
 748             goto free_return;
 749           break;
 750
 751         default:
 752           /* In this case, we can't determine easily the current byte,
 753              since it might be a component byte of a multibyte
 754              character.  Then we use the constructed buffer instead.  */
 755           for (;;)
 756             {
 757               /* If MATCH_FIRST is out of the valid range, reconstruct the
 758                  buffers.  */
 759               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 760               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 761                 {
 762                   err = re_string_reconstruct (&mctx.input, match_first,
 763                                                eflags);
 764                   if (BE (err != REG_NOERROR, 0))
 765                     goto free_return;
 766
 767                   offset = match_first - mctx.input.raw_mbs_idx;
 768                 }
 769               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 770                  Note that MATCH_FIRST must not be smaller than 0.  */
 771               ch = (match_first >= length
 772                     ? 0 : re_string_byte_at (&mctx.input, offset));
 773               if (fastmap[ch])
 774                 break;
 775               match_first += incr;
 776               if (match_first < left_lim || match_first > right_lim)
 777                 {
 778                   err = REG_NOMATCH;
 779                   goto free_return;
 780                 }
 781             }
 782           break;
 783         }
 784
 785       /* Reconstruct the buffers so that the matcher can assume that
 786          the matching starts from the beginning of the buffer.  */
 787       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 788       if (BE (err != REG_NOERROR, 0))
 789         goto free_return;
 790
 791 #ifdef RE_ENABLE_I18N
 792      /* Don't consider this char as a possible match start if it part,
 793         yet isn't the head, of a multibyte character.  */
 794       if (!sb && !re_string_first_byte (&mctx.input, 0))
 795         continue;
 796 #endif
 797
 798       /* It seems to be appropriate one, then use the matcher.  */
 799       /* We assume that the matching starts from 0.  */
 800       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 801       match_last = check_matching (&mctx, fl_longest_match,
 802                                    range >= 0 ? &match_first : NULL);
 803       if (match_last != -1)
 804         {
 805           if (BE (match_last == -2, 0))
 806             {
 807               err = REG_ESPACE;
 808               goto free_return;
 809             }
 810           else
 811             {
 812               mctx.match_last = match_last;
 813               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 814                 {
 815                   re_dfastate_t *pstate = mctx.state_log[match_last];
 816                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 817                                                              match_last);
 818                 }
 819               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 820                   || dfa->nbackref)
 821                 {
 822                   err = prune_impossible_nodes (&mctx);
 823                   if (err == REG_NOERROR)
 824                     break;
 825                   if (BE (err != REG_NOMATCH, 0))
 826                     goto free_return;
 827                   match_last = -1;
 828                 }
 829               else
 830                 break; /* We found a match.  */
 831             }
 832         }
 833
 834       match_ctx_clean (&mctx);
 835     }
 836
 837 #ifdef DEBUG
 838   assert (match_last != -1);
 839   assert (err == REG_NOERROR);
 840 #endif
 841
 842   /* Set pmatch[] if we need.  */
 843   if (nmatch > 0)
 844     {
 845       int reg_idx;
 846
 847       /* Initialize registers.  */
 848       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 849         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 850
 851       /* Set the points where matching start/end.  */
 852       pmatch[0].rm_so = 0;
 853       pmatch[0].rm_eo = mctx.match_last;
 854
 855       if (!preg->no_sub && nmatch > 1)
 856         {
 857           err = set_regs (preg, &mctx, nmatch, pmatch,
 858                           dfa->has_plural_match && dfa->nbackref > 0);
 859           if (BE (err != REG_NOERROR, 0))
 860             goto free_return;
 861         }
 862
 863       /* At last, add the offset to the each registers, since we slided
 864          the buffers so that we could assume that the matching starts
 865          from 0.  */
 866       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 867         if (pmatch[reg_idx].rm_so != -1)
 868           {
 869 #ifdef RE_ENABLE_I18N
 870             if (BE (mctx.input.offsets_needed != 0, 0))
 871               {
 872                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 873                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 874                 else
 875                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 876                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 877                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 878                 else
 879                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 880               }
 881 #else
 882             assert (mctx.input.offsets_needed == 0);
 883 #endif
 884             pmatch[reg_idx].rm_so += match_first;
 885             pmatch[reg_idx].rm_eo += match_first;
 886           }
 887       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 888         {
 889           pmatch[nmatch + reg_idx].rm_so = -1;
 890           pmatch[nmatch + reg_idx].rm_eo = -1;
 891         }
 892
 893       if (dfa->subexp_map)
 894         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 895           if (dfa->subexp_map[reg_idx] != reg_idx)
 896             {
 897               pmatch[reg_idx + 1].rm_so
 898                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 899               pmatch[reg_idx + 1].rm_eo
 900                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 901             }
 902     }
 903
 904  free_return:
 905   re_free (mctx.state_log);
 906   if (dfa->nbackref)
 907     match_ctx_free (&mctx);
 908   re_string_destruct (&mctx.input);
 909   return err;
 910 }
 911
 912 static reg_errcode_t
 913 internal_function
 914 prune_impossible_nodes (re_match_context_t *mctx)
 915 {
 916   re_dfa_t *const dfa = mctx->dfa;
 917   int halt_node, match_last;
 918   reg_errcode_t ret;
 919   re_dfastate_t **sifted_states;
 920   re_dfastate_t **lim_states = NULL;
 921   re_sift_context_t sctx;
 922 #ifdef DEBUG
 923   assert (mctx->state_log != NULL);
 924 #endif
 925   match_last = mctx->match_last;
 926   halt_node = mctx->last_node;
 927   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 928   if (BE (sifted_states == NULL, 0))
 929     {
 930       ret = REG_ESPACE;
 931       goto free_return;
 932     }
 933   if (dfa->nbackref)
 934     {
 935       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 936       if (BE (lim_states == NULL, 0))
 937         {
 938           ret = REG_ESPACE;
 939           goto free_return;
 940         }
 941       while (1)
 942         {
 943           memset (lim_states, '\0',
 944                   sizeof (re_dfastate_t *) * (match_last + 1));
 945           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 946                          match_last);
 947           ret = sift_states_backward (mctx, &sctx);
 948           re_node_set_free (&sctx.limits);
 949           if (BE (ret != REG_NOERROR, 0))
 950               goto free_return;
 951           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 952             break;
 953           do
 954             {
 955               --match_last;
 956               if (match_last < 0)
 957                 {
 958                   ret = REG_NOMATCH;
 959                   goto free_return;
 960                 }
 961             } while (mctx->state_log[match_last] == NULL
 962                      || !mctx->state_log[match_last]->halt);
 963           halt_node = check_halt_state_context (mctx,
 964                                                 mctx->state_log[match_last],
 965                                                 match_last);
 966         }
 967       ret = merge_state_array (dfa, sifted_states, lim_states,
 968                                match_last + 1);
 969       re_free (lim_states);
 970       lim_states = NULL;
 971       if (BE (ret != REG_NOERROR, 0))
 972         goto free_return;
 973     }
 974   else
 975     {
 976       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 977       ret = sift_states_backward (mctx, &sctx);
 978       re_node_set_free (&sctx.limits);
 979       if (BE (ret != REG_NOERROR, 0))
 980         goto free_return;
 981     }
 982   re_free (mctx->state_log);
 983   mctx->state_log = sifted_states;
 984   sifted_states = NULL;
 985   mctx->last_node = halt_node;
 986   mctx->match_last = match_last;
 987   ret = REG_NOERROR;
 988  free_return:
 989   re_free (sifted_states);
 990   re_free (lim_states);
 991   return ret;
 992 }
 993
 994 /* Acquire an initial state and return it.
 995    We must select appropriate initial state depending on the context,
 996    since initial states may have constraints like "\<", "^", etc..  */
 997
 998 static inline re_dfastate_t *
 999 internal_function
1000 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1001                             int idx)
1002 {
1003   re_dfa_t *const dfa = mctx->dfa;
1004   if (dfa->init_state->has_constraint)
1005     {
1006       unsigned int context;
1007       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1008       if (IS_WORD_CONTEXT (context))
1009         return dfa->init_state_word;
1010       else if (IS_ORDINARY_CONTEXT (context))
1011         return dfa->init_state;
1012       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1013         return dfa->init_state_begbuf;
1014       else if (IS_NEWLINE_CONTEXT (context))
1015         return dfa->init_state_nl;
1016       else if (IS_BEGBUF_CONTEXT (context))
1017         {
1018           /* It is relatively rare case, then calculate on demand.  */
1019           return re_acquire_state_context (err, dfa,
1020                                            dfa->init_state->entrance_nodes,
1021                                            context);
1022         }
1023       else
1024         /* Must not happen?  */
1025         return dfa->init_state;
1026     }
1027   else
1028     return dfa->init_state;
1029 }
1030
1031 /* Check whether the regular expression match input string INPUT or not,
1032    and return the index where the matching end, return -1 if not match,
1033    or return -2 in case of an error.
1034    FL_LONGEST_MATCH means we want the POSIX longest matching.
1035    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1036    next place where we may want to try matching.
1037    Note that the matcher assume that the maching starts from the current
1038    index of the buffer.  */
1039
1040 static int
1041 internal_function
1042 check_matching (re_match_context_t *mctx, int fl_longest_match,
1043                 int *p_match_first)
1044 {
1045   re_dfa_t *const dfa = mctx->dfa;
1046   reg_errcode_t err;
1047   int match = 0;
1048   int match_last = -1;
1049   int cur_str_idx = re_string_cur_idx (&mctx->input);
1050   re_dfastate_t *cur_state;
1051   int at_init_state = p_match_first != NULL;
1052   int next_start_idx = cur_str_idx;
1053
1054   err = REG_NOERROR;
1055   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1056   /* An initial state must not be NULL (invalid).  */
1057   if (BE (cur_state == NULL, 0))
1058     {
1059       assert (err == REG_ESPACE);
1060       return -2;
1061     }
1062
1063   if (mctx->state_log != NULL)
1064     {
1065       mctx->state_log[cur_str_idx] = cur_state;
1066
1067       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1068          later.  E.g. Processing back references.  */
1069       if (BE (dfa->nbackref, 0))
1070         {
1071           at_init_state = 0;
1072           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1073           if (BE (err != REG_NOERROR, 0))
1074             return err;
1075
1076           if (cur_state->has_backref)
1077             {
1078               err = transit_state_bkref (mctx, &cur_state->nodes);
1079               if (BE (err != REG_NOERROR, 0))
1080                 return err;
1081             }
1082         }
1083     }
1084
1085   /* If the RE accepts NULL string.  */
1086   if (BE (cur_state->halt, 0))
1087     {
1088       if (!cur_state->has_constraint
1089           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1090         {
1091           if (!fl_longest_match)
1092             return cur_str_idx;
1093           else
1094             {
1095               match_last = cur_str_idx;
1096               match = 1;
1097             }
1098         }
1099     }
1100
1101   while (!re_string_eoi (&mctx->input))
1102     {
1103       re_dfastate_t *old_state = cur_state;
1104       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1105
1106       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1107           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1108               && mctx->input.valid_len < mctx->input.len))
1109         {
1110           err = extend_buffers (mctx);
1111           if (BE (err != REG_NOERROR, 0))
1112             {
1113               assert (err == REG_ESPACE);
1114               return -2;
1115             }
1116         }
1117
1118       cur_state = transit_state (&err, mctx, cur_state);
1119       if (mctx->state_log != NULL)
1120         cur_state = merge_state_with_log (&err, mctx, cur_state);
1121
1122       if (cur_state == NULL)
1123         {
1124           /* Reached the invalid state or an error.  Try to recover a valid
1125              state using the state log, if available and if we have not
1126              already found a valid (even if not the longest) match.  */
1127           if (BE (err != REG_NOERROR, 0))
1128             return -2;
1129
1130           if (mctx->state_log == NULL
1131               || (match && !fl_longest_match)
1132               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1133             break;
1134         }
1135
1136       if (BE (at_init_state, 0))
1137         {
1138           if (old_state == cur_state)
1139             next_start_idx = next_char_idx;
1140           else
1141             at_init_state = 0;
1142         }
1143
1144       if (cur_state->halt)
1145         {
1146           /* Reached a halt state.
1147              Check the halt state can satisfy the current context.  */
1148           if (!cur_state->has_constraint
1149               || check_halt_state_context (mctx, cur_state,
1150                                            re_string_cur_idx (&mctx->input)))
1151             {
1152               /* We found an appropriate halt state.  */
1153               match_last = re_string_cur_idx (&mctx->input);
1154               match = 1;
1155
1156               /* We found a match, do not modify match_first below.  */
1157               p_match_first = NULL;
1158               if (!fl_longest_match)
1159                 break;
1160             }
1161         }
1162     }
1163
1164   if (p_match_first)
1165     *p_match_first += next_start_idx;
1166
1167   return match_last;
1168 }
1169
1170 /* Check NODE match the current context.  */
1171
1172 static int
1173 internal_function
1174 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1175 {
1176   re_token_type_t type = dfa->nodes[node].type;
1177   unsigned int constraint = dfa->nodes[node].constraint;
1178   if (type != END_OF_RE)
1179     return 0;
1180   if (!constraint)
1181     return 1;
1182   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1183     return 0;
1184   return 1;
1185 }
1186
1187 /* Check the halt state STATE match the current context.
1188    Return 0 if not match, if the node, STATE has, is a halt node and
1189    match the context, return the node.  */
1190
1191 static int
1192 internal_function
1193 check_halt_state_context (const re_match_context_t *mctx,
1194                           const re_dfastate_t *state, int idx)
1195 {
1196   int i;
1197   unsigned int context;
1198 #ifdef DEBUG
1199   assert (state->halt);
1200 #endif
1201   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1202   for (i = 0; i < state->nodes.nelem; ++i)
1203     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1204       return state->nodes.elems[i];
1205   return 0;
1206 }
1207
1208 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1209    corresponding to the DFA).
1210    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1211    of errors.  */
1212
1213 static int
1214 internal_function
1215 proceed_next_node (const re_match_context_t *mctx,
1216                    int nregs, regmatch_t *regs, int *pidx, int node,
1217                    re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1218 {
1219   re_dfa_t *const dfa = mctx->dfa;
1220   int i, err;
1221   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1222     {
1223       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1224       re_node_set *edests = &dfa->edests[node];
1225       int dest_node;
1226       err = re_node_set_insert (eps_via_nodes, node);
1227       if (BE (err < 0, 0))
1228         return -2;
1229       /* Pick up a valid destination, or return -1 if none is found.  */
1230       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1231         {
1232           int candidate = edests->elems[i];
1233           if (!re_node_set_contains (cur_nodes, candidate))
1234             continue;
1235           if (dest_node == -1)
1236             dest_node = candidate;
1237
1238           else
1239             {
1240               /* In order to avoid infinite loop like "(a*)*", return the second
1241                  epsilon-transition if the first was already considered.  */
1242               if (re_node_set_contains (eps_via_nodes, dest_node))
1243                 return candidate;
1244
1245               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1246               else if (fs != NULL
1247                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1248                                            eps_via_nodes))
1249                 return -2;
1250
1251               /* We know we are going to exit.  */
1252               break;
1253             }
1254         }
1255       return dest_node;
1256     }
1257   else
1258     {
1259       int naccepted = 0;
1260       re_token_type_t type = dfa->nodes[node].type;
1261
1262 #ifdef RE_ENABLE_I18N
1263       if (dfa->nodes[node].accept_mb)
1264         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1265       else
1266 #endif /* RE_ENABLE_I18N */
1267       if (type == OP_BACK_REF)
1268         {
1269           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1270           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1271           if (fs != NULL)
1272             {
1273               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1274                 return -1;
1275               else if (naccepted)
1276                 {
1277                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1278                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1279                               naccepted) != 0)
1280                     return -1;
1281                 }
1282             }
1283
1284           if (naccepted == 0)
1285             {
1286               int dest_node;
1287               err = re_node_set_insert (eps_via_nodes, node);
1288               if (BE (err < 0, 0))
1289                 return -2;
1290               dest_node = dfa->edests[node].elems[0];
1291               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1292                                         dest_node))
1293                 return dest_node;
1294             }
1295         }
1296
1297       if (naccepted != 0
1298           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1299         {
1300           int dest_node = dfa->nexts[node];
1301           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1302           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1303                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1304                                                dest_node)))
1305             return -1;
1306           re_node_set_empty (eps_via_nodes);
1307           return dest_node;
1308         }
1309     }
1310   return -1;
1311 }
1312
1313 static reg_errcode_t
1314 internal_function
1315 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1316                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1317 {
1318   reg_errcode_t err;
1319   int num = fs->num++;
1320   if (fs->num == fs->alloc)
1321     {
1322       struct re_fail_stack_ent_t *new_array;
1323       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1324                                        * fs->alloc * 2));
1325       if (new_array == NULL)
1326         return REG_ESPACE;
1327       fs->alloc *= 2;
1328       fs->stack = new_array;
1329     }
1330   fs->stack[num].idx = str_idx;
1331   fs->stack[num].node = dest_node;
1332   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1333   if (fs->stack[num].regs == NULL)
1334     return REG_ESPACE;
1335   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1336   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1337   return err;
1338 }
1339
1340 static int
1341 internal_function
1342 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx,
1343                 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1344 {
1345   int num = --fs->num;
1346   assert (num >= 0);
1347   *pidx = fs->stack[num].idx;
1348   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1349   re_node_set_free (eps_via_nodes);
1350   re_free (fs->stack[num].regs);
1351   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1352   return fs->stack[num].node;
1353 }
1354
1355 /* Set the positions where the subexpressions are starts/ends to registers
1356    PMATCH.
1357    Note: We assume that pmatch[0] is already set, and
1358    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1359
1360 static reg_errcode_t
1361 internal_function
1362 set_regs (const regex_t *preg, const re_match_context_t *mctx,
1363           size_t nmatch, regmatch_t *pmatch, int fl_backtrack)
1364 {
1365   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1366   int idx, cur_node;
1367   re_node_set eps_via_nodes;
1368   struct re_fail_stack_t *fs;
1369   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1370   regmatch_t *prev_idx_match;
1371
1372 #ifdef DEBUG
1373   assert (nmatch > 1);
1374   assert (mctx->state_log != NULL);
1375 #endif
1376   if (fl_backtrack)
1377     {
1378       fs = &fs_body;
1379       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1380       if (fs->stack == NULL)
1381         return REG_ESPACE;
1382     }
1383   else
1384     fs = NULL;
1385
1386   cur_node = dfa->init_node;
1387   re_node_set_init_empty (&eps_via_nodes);
1388
1389   prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * nmatch);
1390   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1391
1392   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1393     {
1394       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1395
1396       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1397         {
1398           int reg_idx;
1399           if (fs)
1400             {
1401               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1402                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1403                   break;
1404               if (reg_idx == nmatch)
1405                 {
1406                   re_node_set_free (&eps_via_nodes);
1407                   return free_fail_stack_return (fs);
1408                 }
1409               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1410                                          &eps_via_nodes);
1411             }
1412           else
1413             {
1414               re_node_set_free (&eps_via_nodes);
1415               return REG_NOERROR;
1416             }
1417         }
1418
1419       /* Proceed to next node.  */
1420       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1421                                     &eps_via_nodes, fs);
1422
1423       if (BE (cur_node < 0, 0))
1424         {
1425           if (BE (cur_node == -2, 0))
1426             {
1427               re_node_set_free (&eps_via_nodes);
1428               free_fail_stack_return (fs);
1429               return REG_ESPACE;
1430             }
1431           if (fs)
1432             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1433                                        &eps_via_nodes);
1434           else
1435             {
1436               re_node_set_free (&eps_via_nodes);
1437               return REG_NOMATCH;
1438             }
1439         }
1440     }
1441   re_node_set_free (&eps_via_nodes);
1442   return free_fail_stack_return (fs);
1443 }
1444
1445 static reg_errcode_t
1446 internal_function
1447 free_fail_stack_return (struct re_fail_stack_t *fs)
1448 {
1449   if (fs)
1450     {
1451       int fs_idx;
1452       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1453         {
1454           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1455           re_free (fs->stack[fs_idx].regs);
1456         }
1457       re_free (fs->stack);
1458     }
1459   return REG_NOERROR;
1460 }
1461
1462 static void
1463 internal_function
1464 update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1465              int cur_node, int cur_idx, int nmatch)
1466 {
1467   int type = dfa->nodes[cur_node].type;
1468   if (type == OP_OPEN_SUBEXP)
1469     {
1470       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1471
1472       /* We are at the first node of this sub expression.  */
1473       if (reg_num < nmatch)
1474         {
1475           pmatch[reg_num].rm_so = cur_idx;
1476           pmatch[reg_num].rm_eo = -1;
1477         }
1478     }
1479   else if (type == OP_CLOSE_SUBEXP)
1480     {
1481       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1482       if (reg_num < nmatch)
1483         {
1484           /* We are at the last node of this sub expression.  */
1485           if (pmatch[reg_num].rm_so < cur_idx)
1486             {
1487               pmatch[reg_num].rm_eo = cur_idx;
1488               /* This is a non-empty match or we are not inside an optional
1489                  subexpression.  Accept this right away.  */
1490               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1491             }
1492           else
1493             {
1494               if (dfa->nodes[cur_node].opt_subexp
1495                   && prev_idx_match[reg_num].rm_so != -1)
1496                 /* We transited through an empty match for an optional
1497                    subexpression, like (a?)*, and this is not the subexp's
1498                    first match.  Copy back the old content of the registers
1499                    so that matches of an inner subexpression are undone as
1500                    well, like in ((a?))*.  */
1501                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1502               else
1503                 /* We completed a subexpression, but it may be part of
1504                    an optional one, so do not update PREV_IDX_MATCH.  */
1505                 pmatch[reg_num].rm_eo = cur_idx;
1506             }
1507         }
1508     }
1509 }
1510
1511 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1512    and sift the nodes in each states according to the following rules.
1513    Updated state_log will be wrote to STATE_LOG.
1514
1515    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1516      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1517         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1518         the LAST_NODE, we throw away the node `a'.
1519      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1520         string `s' and transit to `b':
1521         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1522            away the node `a'.
1523         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1524             thrown away, we throw away the node `a'.
1525      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1526         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1527            node `a'.
1528         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1529             we throw away the node `a'.  */
1530
1531 #define STATE_NODE_CONTAINS(state,node) \
1532   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1533
1534 static reg_errcode_t
1535 internal_function
1536 sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1537 {
1538   reg_errcode_t err;
1539   int null_cnt = 0;
1540   int str_idx = sctx->last_str_idx;
1541   re_node_set cur_dest;
1542
1543 #ifdef DEBUG
1544   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1545 #endif
1546
1547   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1548      transit to the last_node and the last_node itself.  */
1549   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1550   if (BE (err != REG_NOERROR, 0))
1551     return err;
1552   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1553   if (BE (err != REG_NOERROR, 0))
1554     goto free_return;
1555
1556   /* Then check each states in the state_log.  */
1557   while (str_idx > 0)
1558     {
1559       /* Update counters.  */
1560       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1561       if (null_cnt > mctx->max_mb_elem_len)
1562         {
1563           memset (sctx->sifted_states, '\0',
1564                   sizeof (re_dfastate_t *) * str_idx);
1565           re_node_set_free (&cur_dest);
1566           return REG_NOERROR;
1567         }
1568       re_node_set_empty (&cur_dest);
1569       --str_idx;
1570
1571       if (mctx->state_log[str_idx])
1572         {
1573           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1574           if (BE (err != REG_NOERROR, 0))
1575             goto free_return;
1576         }
1577
1578       /* Add all the nodes which satisfy the following conditions:
1579          - It can epsilon transit to a node in CUR_DEST.
1580          - It is in CUR_SRC.
1581          And update state_log.  */
1582       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1583       if (BE (err != REG_NOERROR, 0))
1584         goto free_return;
1585     }
1586   err = REG_NOERROR;
1587  free_return:
1588   re_node_set_free (&cur_dest);
1589   return err;
1590 }
1591
1592 static reg_errcode_t
1593 internal_function
1594 build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1595                      int str_idx, re_node_set *cur_dest)
1596 {
1597   re_dfa_t *const dfa = mctx->dfa;
1598   re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1599   int i;
1600
1601   /* Then build the next sifted state.
1602      We build the next sifted state on `cur_dest', and update
1603      `sifted_states[str_idx]' with `cur_dest'.
1604      Note:
1605      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1606      `cur_src' points the node_set of the old `state_log[str_idx]'
1607      (with the epsilon nodes pre-filtered out).  */
1608   for (i = 0; i < cur_src->nelem; i++)
1609     {
1610       int prev_node = cur_src->elems[i];
1611       int naccepted = 0;
1612       int ret;
1613
1614 #ifdef DEBUG
1615       re_token_type_t type = dfa->nodes[prev_node].type;
1616       assert (!IS_EPSILON_NODE (type));
1617 #endif
1618 #ifdef RE_ENABLE_I18N
1619       /* If the node may accept `multi byte'.  */
1620       if (dfa->nodes[prev_node].accept_mb)
1621         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1622                                          str_idx, sctx->last_str_idx);
1623 #endif /* RE_ENABLE_I18N */
1624
1625       /* We don't check backreferences here.
1626          See update_cur_sifted_state().  */
1627       if (!naccepted
1628           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1629           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1630                                   dfa->nexts[prev_node]))
1631         naccepted = 1;
1632
1633       if (naccepted == 0)
1634         continue;
1635
1636       if (sctx->limits.nelem)
1637         {
1638           int to_idx = str_idx + naccepted;
1639           if (check_dst_limits (mctx, &sctx->limits,
1640                                 dfa->nexts[prev_node], to_idx,
1641                                 prev_node, str_idx))
1642             continue;
1643         }
1644       ret = re_node_set_insert (cur_dest, prev_node);
1645       if (BE (ret == -1, 0))
1646         return REG_ESPACE;
1647     }
1648
1649   return REG_NOERROR;
1650 }
1651
1652 /* Helper functions.  */
1653
1654 static reg_errcode_t
1655 internal_function
1656 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1657 {
1658   int top = mctx->state_log_top;
1659
1660   if (next_state_log_idx >= mctx->input.bufs_len
1661       || (next_state_log_idx >= mctx->input.valid_len
1662           && mctx->input.valid_len < mctx->input.len))
1663     {
1664       reg_errcode_t err;
1665       err = extend_buffers (mctx);
1666       if (BE (err != REG_NOERROR, 0))
1667         return err;
1668     }
1669
1670   if (top < next_state_log_idx)
1671     {
1672       memset (mctx->state_log + top + 1, '\0',
1673               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1674       mctx->state_log_top = next_state_log_idx;
1675     }
1676   return REG_NOERROR;
1677 }
1678
1679 static reg_errcode_t
1680 internal_function
1681 merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1682                    int num)
1683 {
1684   int st_idx;
1685   reg_errcode_t err;
1686   for (st_idx = 0; st_idx < num; ++st_idx)
1687     {
1688       if (dst[st_idx] == NULL)
1689         dst[st_idx] = src[st_idx];
1690       else if (src[st_idx] != NULL)
1691         {
1692           re_node_set merged_set;
1693           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1694                                         &src[st_idx]->nodes);
1695           if (BE (err != REG_NOERROR, 0))
1696             return err;
1697           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1698           re_node_set_free (&merged_set);
1699           if (BE (err != REG_NOERROR, 0))
1700             return err;
1701         }
1702     }
1703   return REG_NOERROR;
1704 }
1705
1706 static reg_errcode_t
1707 internal_function
1708 update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1709                          int str_idx, re_node_set *dest_nodes)
1710 {
1711   re_dfa_t *const dfa = mctx->dfa;
1712   reg_errcode_t err;
1713   const re_node_set *candidates;
1714   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1715                 : &mctx->state_log[str_idx]->nodes);
1716
1717   if (dest_nodes->nelem == 0)
1718     sctx->sifted_states[str_idx] = NULL;
1719   else
1720     {
1721       if (candidates)
1722         {
1723           /* At first, add the nodes which can epsilon transit to a node in
1724              DEST_NODE.  */
1725           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1726           if (BE (err != REG_NOERROR, 0))
1727             return err;
1728
1729           /* Then, check the limitations in the current sift_context.  */
1730           if (sctx->limits.nelem)
1731             {
1732               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1733                                          mctx->bkref_ents, str_idx);
1734               if (BE (err != REG_NOERROR, 0))
1735                 return err;
1736             }
1737         }
1738
1739       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1740       if (BE (err != REG_NOERROR, 0))
1741         return err;
1742     }
1743
1744   if (candidates && mctx->state_log[str_idx]->has_backref)
1745     {
1746       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1747       if (BE (err != REG_NOERROR, 0))
1748         return err;
1749     }
1750   return REG_NOERROR;
1751 }
1752
1753 static reg_errcode_t
1754 internal_function
1755 add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1756                        const re_node_set *candidates)
1757 {
1758   reg_errcode_t err = REG_NOERROR;
1759   int i;
1760
1761   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1762   if (BE (err != REG_NOERROR, 0))
1763     return err;
1764
1765   if (!state->inveclosure.alloc)
1766     {
1767       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1768       if (BE (err != REG_NOERROR, 0))
1769         return REG_ESPACE;
1770       for (i = 0; i < dest_nodes->nelem; i++)
1771         re_node_set_merge (&state->inveclosure,
1772                            dfa->inveclosures + dest_nodes->elems[i]);
1773     }
1774   return re_node_set_add_intersect (dest_nodes, candidates,
1775                                     &state->inveclosure);
1776 }
1777
1778 static reg_errcode_t
1779 internal_function
1780 sub_epsilon_src_nodes (re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1781                        const re_node_set *candidates)
1782 {
1783     int ecl_idx;
1784     reg_errcode_t err;
1785     re_node_set *inv_eclosure = dfa->inveclosures + node;
1786     re_node_set except_nodes;
1787     re_node_set_init_empty (&except_nodes);
1788     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1789       {
1790         int cur_node = inv_eclosure->elems[ecl_idx];
1791         if (cur_node == node)
1792           continue;
1793         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1794           {
1795             int edst1 = dfa->edests[cur_node].elems[0];
1796             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1797                          ? dfa->edests[cur_node].elems[1] : -1);
1798             if ((!re_node_set_contains (inv_eclosure, edst1)
1799                  && re_node_set_contains (dest_nodes, edst1))
1800                 || (edst2 > 0
1801                     && !re_node_set_contains (inv_eclosure, edst2)
1802                     && re_node_set_contains (dest_nodes, edst2)))
1803               {
1804                 err = re_node_set_add_intersect (&except_nodes, candidates,
1805                                                  dfa->inveclosures + cur_node);
1806                 if (BE (err != REG_NOERROR, 0))
1807                   {
1808                     re_node_set_free (&except_nodes);
1809                     return err;
1810                   }
1811               }
1812           }
1813       }
1814     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1815       {
1816         int cur_node = inv_eclosure->elems[ecl_idx];
1817         if (!re_node_set_contains (&except_nodes, cur_node))
1818           {
1819             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1820             re_node_set_remove_at (dest_nodes, idx);
1821           }
1822       }
1823     re_node_set_free (&except_nodes);
1824     return REG_NOERROR;
1825 }
1826
1827 static int
1828 internal_function
1829 check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
1830                   int dst_node, int dst_idx, int src_node, int src_idx)
1831 {
1832   re_dfa_t *const dfa = mctx->dfa;
1833   int lim_idx, src_pos, dst_pos;
1834
1835   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1836   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1837   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1838     {
1839       int subexp_idx;
1840       struct re_backref_cache_entry *ent;
1841       ent = mctx->bkref_ents + limits->elems[lim_idx];
1842       subexp_idx = dfa->nodes[ent->node].opr.idx;
1843
1844       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1845                                            subexp_idx, dst_node, dst_idx,
1846                                            dst_bkref_idx);
1847       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1848                                            subexp_idx, src_node, src_idx,
1849                                            src_bkref_idx);
1850
1851       /* In case of:
1852          <src> <dst> ( <subexp> )
1853          ( <subexp> ) <src> <dst>
1854          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1855       if (src_pos == dst_pos)
1856         continue; /* This is unrelated limitation.  */
1857       else
1858         return 1;
1859     }
1860   return 0;
1861 }
1862
1863 static int
1864 internal_function
1865 check_dst_limits_calc_pos_1 (re_match_context_t *mctx, int boundaries,
1866                              int subexp_idx, int from_node, int bkref_idx)
1867 {
1868   re_dfa_t *const dfa = mctx->dfa;
1869   re_node_set *eclosures = dfa->eclosures + from_node;
1870   int node_idx;
1871
1872   /* Else, we are on the boundary: examine the nodes on the epsilon
1873      closure.  */
1874   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1875     {
1876       int node = eclosures->elems[node_idx];
1877       switch (dfa->nodes[node].type)
1878         {
1879         case OP_BACK_REF:
1880           if (bkref_idx != -1)
1881             {
1882               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1883               do
1884                 {
1885                   int dst, cpos;
1886
1887                   if (ent->node != node)
1888                     continue;
1889
1890                   if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1891                       && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1892                     continue;
1893
1894                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1895                      OP_CLOSE_SUBEXP cases below.  But, if the
1896                      destination node is the same node as the source
1897                      node, don't recurse because it would cause an
1898                      infinite loop: a regex that exhibits this behavior
1899                      is ()\1*\1*  */
1900                   dst = dfa->edests[node].elems[0];
1901                   if (dst == from_node)
1902                     {
1903                       if (boundaries & 1)
1904                         return -1;
1905                       else /* if (boundaries & 2) */
1906                         return 0;
1907                     }
1908
1909                   cpos =
1910                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1911                                                  dst, bkref_idx);
1912                   if (cpos == -1 /* && (boundaries & 1) */)
1913                     return -1;
1914                   if (cpos == 0 && (boundaries & 2))
1915                     return 0;
1916
1917                   ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1918                 }
1919               while (ent++->more);
1920             }
1921           break;
1922
1923         case OP_OPEN_SUBEXP:
1924           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1925             return -1;
1926           break;
1927
1928         case OP_CLOSE_SUBEXP:
1929           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1930             return 0;
1931           break;
1932
1933         default:
1934             break;
1935         }
1936     }
1937
1938   return (boundaries & 2) ? 1 : 0;
1939 }
1940
1941 static int
1942 internal_function
1943 check_dst_limits_calc_pos (re_match_context_t *mctx, int limit, int subexp_idx,
1944                            int from_node, int str_idx, int bkref_idx)
1945 {
1946   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1947   int boundaries;
1948
1949   /* If we are outside the range of the subexpression, return -1 or 1.  */
1950   if (str_idx < lim->subexp_from)
1951     return -1;
1952
1953   if (lim->subexp_to < str_idx)
1954     return 1;
1955
1956   /* If we are within the subexpression, return 0.  */
1957   boundaries = (str_idx == lim->subexp_from);
1958   boundaries |= (str_idx == lim->subexp_to) << 1;
1959   if (boundaries == 0)
1960     return 0;
1961
1962   /* Else, examine epsilon closure.  */
1963   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1964                                       from_node, bkref_idx);
1965 }
1966
1967 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1968    which are against limitations from DEST_NODES. */
1969
1970 static reg_errcode_t
1971 internal_function
1972 check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
1973                      const re_node_set *candidates, re_node_set *limits,
1974                      struct re_backref_cache_entry *bkref_ents, int str_idx)
1975 {
1976   reg_errcode_t err;
1977   int node_idx, lim_idx;
1978
1979   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1980     {
1981       int subexp_idx;
1982       struct re_backref_cache_entry *ent;
1983       ent = bkref_ents + limits->elems[lim_idx];
1984
1985       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1986         continue; /* This is unrelated limitation.  */
1987
1988       subexp_idx = dfa->nodes[ent->node].opr.idx;
1989       if (ent->subexp_to == str_idx)
1990         {
1991           int ops_node = -1;
1992           int cls_node = -1;
1993           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1994             {
1995               int node = dest_nodes->elems[node_idx];
1996               re_token_type_t type = dfa->nodes[node].type;
1997               if (type == OP_OPEN_SUBEXP
1998                   && subexp_idx == dfa->nodes[node].opr.idx)
1999                 ops_node = node;
2000               else if (type == OP_CLOSE_SUBEXP
2001                        && subexp_idx == dfa->nodes[node].opr.idx)
2002                 cls_node = node;
2003             }
2004
2005           /* Check the limitation of the open subexpression.  */
2006           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2007           if (ops_node >= 0)
2008             {
2009               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2010                                            candidates);
2011               if (BE (err != REG_NOERROR, 0))
2012                 return err;
2013             }
2014
2015           /* Check the limitation of the close subexpression.  */
2016           if (cls_node >= 0)
2017             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2018               {
2019                 int node = dest_nodes->elems[node_idx];
2020                 if (!re_node_set_contains (dfa->inveclosures + node,
2021                                            cls_node)
2022                     && !re_node_set_contains (dfa->eclosures + node,
2023                                               cls_node))
2024                   {
2025                     /* It is against this limitation.
2026                        Remove it form the current sifted state.  */
2027                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2028                                                  candidates);
2029                     if (BE (err != REG_NOERROR, 0))
2030                       return err;
2031                     --node_idx;
2032                   }
2033               }
2034         }
2035       else /* (ent->subexp_to != str_idx)  */
2036         {
2037           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2038             {
2039               int node = dest_nodes->elems[node_idx];
2040               re_token_type_t type = dfa->nodes[node].type;
2041               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2042                 {
2043                   if (subexp_idx != dfa->nodes[node].opr.idx)
2044                     continue;
2045                   /* It is against this limitation.
2046                      Remove it form the current sifted state.  */
2047                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2048                                                candidates);
2049                   if (BE (err != REG_NOERROR, 0))
2050                     return err;
2051                 }
2052             }
2053         }
2054     }
2055   return REG_NOERROR;
2056 }
2057
2058 static reg_errcode_t
2059 internal_function
2060 sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2061                    int str_idx, const re_node_set *candidates)
2062 {
2063   re_dfa_t *const dfa = mctx->dfa;
2064   reg_errcode_t err;
2065   int node_idx, node;
2066   re_sift_context_t local_sctx;
2067   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2068
2069   if (first_idx == -1)
2070     return REG_NOERROR;
2071
2072   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2073
2074   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2075     {
2076       int enabled_idx;
2077       re_token_type_t type;
2078       struct re_backref_cache_entry *entry;
2079       node = candidates->elems[node_idx];
2080       type = dfa->nodes[node].type;
2081       /* Avoid infinite loop for the REs like "()\1+".  */
2082       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2083         continue;
2084       if (type != OP_BACK_REF)
2085         continue;
2086
2087       entry = mctx->bkref_ents + first_idx;
2088       enabled_idx = first_idx;
2089       do
2090         {
2091           int subexp_len, to_idx, dst_node;
2092           re_dfastate_t *cur_state;
2093
2094           if (entry->node != node)
2095             continue;
2096           subexp_len = entry->subexp_to - entry->subexp_from;
2097           to_idx = str_idx + subexp_len;
2098           dst_node = (subexp_len ? dfa->nexts[node]
2099                       : dfa->edests[node].elems[0]);
2100
2101           if (to_idx > sctx->last_str_idx
2102               || sctx->sifted_states[to_idx] == NULL
2103               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2104               || check_dst_limits (mctx, &sctx->limits, node,
2105                                    str_idx, dst_node, to_idx))
2106             continue;
2107
2108           if (local_sctx.sifted_states == NULL)
2109             {
2110               local_sctx = *sctx;
2111               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2112               if (BE (err != REG_NOERROR, 0))
2113                 goto free_return;
2114             }
2115           local_sctx.last_node = node;
2116           local_sctx.last_str_idx = str_idx;
2117           err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2118           if (BE (err < 0, 0))
2119             {
2120               err = REG_ESPACE;
2121               goto free_return;
2122             }
2123           cur_state = local_sctx.sifted_states[str_idx];
2124           err = sift_states_backward (mctx, &local_sctx);
2125           if (BE (err != REG_NOERROR, 0))
2126             goto free_return;
2127           if (sctx->limited_states != NULL)
2128             {
2129               err = merge_state_array (dfa, sctx->limited_states,
2130                                        local_sctx.sifted_states,
2131                                        str_idx + 1);
2132               if (BE (err != REG_NOERROR, 0))
2133                 goto free_return;
2134             }
2135           local_sctx.sifted_states[str_idx] = cur_state;
2136           re_node_set_remove (&local_sctx.limits, enabled_idx);
2137
2138           /* mctx->bkref_ents may have changed, reload the pointer.  */
2139           entry = mctx->bkref_ents + enabled_idx;
2140         }
2141       while (enabled_idx++, entry++->more);
2142     }
2143   err = REG_NOERROR;
2144  free_return:
2145   if (local_sctx.sifted_states != NULL)
2146     {
2147       re_node_set_free (&local_sctx.limits);
2148     }
2149
2150   return err;
2151 }
2152
2153
2154 #ifdef RE_ENABLE_I18N
2155 static int
2156 internal_function
2157 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2158                      int node_idx, int str_idx, int max_str_idx)
2159 {
2160   re_dfa_t *const dfa = mctx->dfa;
2161   int naccepted;
2162   /* Check the node can accept `multi byte'.  */
2163   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2164   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2165       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2166                             dfa->nexts[node_idx]))
2167     /* The node can't accept the `multi byte', or the
2168        destination was already thrown away, then the node
2169        could't accept the current input `multi byte'.   */
2170     naccepted = 0;
2171   /* Otherwise, it is sure that the node could accept
2172      `naccepted' bytes input.  */
2173   return naccepted;
2174 }
2175 #endif /* RE_ENABLE_I18N */
2176
2177 \f
2178 /* Functions for state transition.  */
2179
2180 /* Return the next state to which the current state STATE will transit by
2181    accepting the current input byte, and update STATE_LOG if necessary.
2182    If STATE can accept a multibyte char/collating element/back reference
2183    update the destination of STATE_LOG.  */
2184
2185 static re_dfastate_t *
2186 internal_function
2187 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2188                re_dfastate_t *state)
2189 {
2190   re_dfastate_t **trtable;
2191   unsigned char ch;
2192
2193 #ifdef RE_ENABLE_I18N
2194   /* If the current state can accept multibyte.  */
2195   if (BE (state->accept_mb, 0))
2196     {
2197       *err = transit_state_mb (mctx, state);
2198       if (BE (*err != REG_NOERROR, 0))
2199         return NULL;
2200     }
2201 #endif /* RE_ENABLE_I18N */
2202
2203   /* Then decide the next state with the single byte.  */
2204 #if 0
2205   if (0)
2206     /* don't use transition table  */
2207     return transit_state_sb (err, mctx, state);
2208 #endif
2209
2210   /* Use transition table  */
2211   ch = re_string_fetch_byte (&mctx->input);
2212   for (;;)
2213     {
2214       trtable = state->trtable;
2215       if (BE (trtable != NULL, 1))
2216         return trtable[ch];
2217
2218       trtable = state->word_trtable;
2219       if (BE (trtable != NULL, 1))
2220         {
2221           unsigned int context;
2222           context
2223             = re_string_context_at (&mctx->input,
2224                                     re_string_cur_idx (&mctx->input) - 1,
2225                                     mctx->eflags);
2226           if (IS_WORD_CONTEXT (context))
2227             return trtable[ch + SBC_MAX];
2228           else
2229             return trtable[ch];
2230         }
2231
2232       if (!build_trtable (mctx->dfa, state))
2233         {
2234           *err = REG_ESPACE;
2235           return NULL;
2236         }
2237
2238       /* Retry, we now have a transition table.  */
2239     }
2240 }
2241
2242 /* Update the state_log if we need */
2243 re_dfastate_t *
2244 internal_function
2245 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2246                       re_dfastate_t *next_state)
2247 {
2248   re_dfa_t *const dfa = mctx->dfa;
2249   int cur_idx = re_string_cur_idx (&mctx->input);
2250
2251   if (cur_idx > mctx->state_log_top)
2252     {
2253       mctx->state_log[cur_idx] = next_state;
2254       mctx->state_log_top = cur_idx;
2255     }
2256   else if (mctx->state_log[cur_idx] == 0)
2257     {
2258       mctx->state_log[cur_idx] = next_state;
2259     }
2260   else
2261     {
2262       re_dfastate_t *pstate;
2263       unsigned int context;
2264       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2265       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2266          the destination of a multibyte char/collating element/
2267          back reference.  Then the next state is the union set of
2268          these destinations and the results of the transition table.  */
2269       pstate = mctx->state_log[cur_idx];
2270       log_nodes = pstate->entrance_nodes;
2271       if (next_state != NULL)
2272         {
2273           table_nodes = next_state->entrance_nodes;
2274           *err = re_node_set_init_union (&next_nodes, table_nodes,
2275                                              log_nodes);
2276           if (BE (*err != REG_NOERROR, 0))
2277             return NULL;
2278         }
2279       else
2280         next_nodes = *log_nodes;
2281       /* Note: We already add the nodes of the initial state,
2282          then we don't need to add them here.  */
2283
2284       context = re_string_context_at (&mctx->input,
2285                                       re_string_cur_idx (&mctx->input) - 1,
2286                                       mctx->eflags);
2287       next_state = mctx->state_log[cur_idx]
2288         = re_acquire_state_context (err, dfa, &next_nodes, context);
2289       /* We don't need to check errors here, since the return value of
2290          this function is next_state and ERR is already set.  */
2291
2292       if (table_nodes != NULL)
2293         re_node_set_free (&next_nodes);
2294     }
2295
2296   if (BE (dfa->nbackref, 0) && next_state != NULL)
2297     {
2298       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2299          later.  We must check them here, since the back references in the
2300          next state might use them.  */
2301       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2302                                         cur_idx);
2303       if (BE (*err != REG_NOERROR, 0))
2304         return NULL;
2305
2306       /* If the next state has back references.  */
2307       if (next_state->has_backref)
2308         {
2309           *err = transit_state_bkref (mctx, &next_state->nodes);
2310           if (BE (*err != REG_NOERROR, 0))
2311             return NULL;
2312           next_state = mctx->state_log[cur_idx];
2313         }
2314     }
2315
2316   return next_state;
2317 }
2318
2319 /* Skip bytes in the input that correspond to part of a
2320    multi-byte match, then look in the log for a state
2321    from which to restart matching.  */
2322 static re_dfastate_t *
2323 internal_function
2324 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2325 {
2326   re_dfastate_t *cur_state = NULL;
2327   do
2328     {
2329       int max = mctx->state_log_top;
2330       int cur_str_idx = re_string_cur_idx (&mctx->input);
2331
2332       do
2333         {
2334           if (++cur_str_idx > max)
2335             return NULL;
2336           re_string_skip_bytes (&mctx->input, 1);
2337         }
2338       while (mctx->state_log[cur_str_idx] == NULL);
2339
2340       cur_state = merge_state_with_log (err, mctx, NULL);
2341     }
2342   while (err == REG_NOERROR && cur_state == NULL);
2343   return cur_state;
2344 }
2345
2346 /* Helper functions for transit_state.  */
2347
2348 /* From the node set CUR_NODES, pick up the nodes whose types are
2349    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2350    expression. And register them to use them later for evaluating the
2351    correspoding back references.  */
2352
2353 static reg_errcode_t
2354 internal_function
2355 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2356                            int str_idx)
2357 {
2358   re_dfa_t *const dfa = mctx->dfa;
2359   int node_idx;
2360   reg_errcode_t err;
2361
2362   /* TODO: This isn't efficient.
2363            Because there might be more than one nodes whose types are
2364            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2365            nodes.
2366            E.g. RE: (a){2}  */
2367   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2368     {
2369       int node = cur_nodes->elems[node_idx];
2370       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2371           && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2372           && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2373         {
2374           err = match_ctx_add_subtop (mctx, node, str_idx);
2375           if (BE (err != REG_NOERROR, 0))
2376             return err;
2377         }
2378     }
2379   return REG_NOERROR;
2380 }
2381
2382 #if 0
2383 /* Return the next state to which the current state STATE will transit by
2384    accepting the current input byte.  */
2385
2386 static re_dfastate_t *
2387 transit_state_sb (err, mctx, state)
2388      reg_errcode_t *err;
2389      re_match_context_t *mctx;
2390      re_dfastate_t *state;
2391 {
2392   re_dfa_t *const dfa = mctx->dfa;
2393   re_node_set next_nodes;
2394   re_dfastate_t *next_state;
2395   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2396   unsigned int context;
2397
2398   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2399   if (BE (*err != REG_NOERROR, 0))
2400     return NULL;
2401   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2402     {
2403       int cur_node = state->nodes.elems[node_cnt];
2404       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2405         {
2406           *err = re_node_set_merge (&next_nodes,
2407                                     dfa->eclosures + dfa->nexts[cur_node]);
2408           if (BE (*err != REG_NOERROR, 0))
2409             {
2410               re_node_set_free (&next_nodes);
2411               return NULL;
2412             }
2413         }
2414     }
2415   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2416   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2417   /* We don't need to check errors here, since the return value of
2418      this function is next_state and ERR is already set.  */
2419
2420   re_node_set_free (&next_nodes);
2421   re_string_skip_bytes (&mctx->input, 1);
2422   return next_state;
2423 }
2424 #endif
2425
2426 #ifdef RE_ENABLE_I18N
2427 static reg_errcode_t
2428 internal_function
2429 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2430 {
2431   re_dfa_t *const dfa = mctx->dfa;
2432   reg_errcode_t err;
2433   int i;
2434
2435   for (i = 0; i < pstate->nodes.nelem; ++i)
2436     {
2437       re_node_set dest_nodes, *new_nodes;
2438       int cur_node_idx = pstate->nodes.elems[i];
2439       int naccepted, dest_idx;
2440       unsigned int context;
2441       re_dfastate_t *dest_state;
2442
2443       if (!dfa->nodes[cur_node_idx].accept_mb)
2444         continue;
2445
2446       if (dfa->nodes[cur_node_idx].constraint)
2447         {
2448           context = re_string_context_at (&mctx->input,
2449                                           re_string_cur_idx (&mctx->input),
2450                                           mctx->eflags);
2451           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2452                                            context))
2453             continue;
2454         }
2455
2456       /* How many bytes the node can accept?  */
2457       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2458                                            re_string_cur_idx (&mctx->input));
2459       if (naccepted == 0)
2460         continue;
2461
2462       /* The node can accepts `naccepted' bytes.  */
2463       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2464       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2465                                : mctx->max_mb_elem_len);
2466       err = clean_state_log_if_needed (mctx, dest_idx);
2467       if (BE (err != REG_NOERROR, 0))
2468         return err;
2469 #ifdef DEBUG
2470       assert (dfa->nexts[cur_node_idx] != -1);
2471 #endif
2472       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2473
2474       dest_state = mctx->state_log[dest_idx];
2475       if (dest_state == NULL)
2476         dest_nodes = *new_nodes;
2477       else
2478         {
2479           err = re_node_set_init_union (&dest_nodes,
2480                                         dest_state->entrance_nodes, new_nodes);
2481           if (BE (err != REG_NOERROR, 0))
2482             return err;
2483         }
2484       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2485       mctx->state_log[dest_idx]
2486         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2487       if (dest_state != NULL)
2488         re_node_set_free (&dest_nodes);
2489       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2490         return err;
2491     }
2492   return REG_NOERROR;
2493 }
2494 #endif /* RE_ENABLE_I18N */
2495
2496 static reg_errcode_t
2497 internal_function
2498 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2499 {
2500   re_dfa_t *const dfa = mctx->dfa;
2501   reg_errcode_t err;
2502   int i;
2503   int cur_str_idx = re_string_cur_idx (&mctx->input);
2504
2505   for (i = 0; i < nodes->nelem; ++i)
2506     {
2507       int dest_str_idx, prev_nelem, bkc_idx;
2508       int node_idx = nodes->elems[i];
2509       unsigned int context;
2510       const re_token_t *node = dfa->nodes + node_idx;
2511       re_node_set *new_dest_nodes;
2512
2513       /* Check whether `node' is a backreference or not.  */
2514       if (node->type != OP_BACK_REF)
2515         continue;
2516
2517       if (node->constraint)
2518         {
2519           context = re_string_context_at (&mctx->input, cur_str_idx,
2520                                           mctx->eflags);
2521           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2522             continue;
2523         }
2524
2525       /* `node' is a backreference.
2526          Check the substring which the substring matched.  */
2527       bkc_idx = mctx->nbkref_ents;
2528       err = get_subexp (mctx, node_idx, cur_str_idx);
2529       if (BE (err != REG_NOERROR, 0))
2530         goto free_return;
2531
2532       /* And add the epsilon closures (which is `new_dest_nodes') of
2533          the backreference to appropriate state_log.  */
2534 #ifdef DEBUG
2535       assert (dfa->nexts[node_idx] != -1);
2536 #endif
2537       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2538         {
2539           int subexp_len;
2540           re_dfastate_t *dest_state;
2541           struct re_backref_cache_entry *bkref_ent;
2542           bkref_ent = mctx->bkref_ents + bkc_idx;
2543           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2544             continue;
2545           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2546           new_dest_nodes = (subexp_len == 0
2547                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2548                             : dfa->eclosures + dfa->nexts[node_idx]);
2549           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2550                           - bkref_ent->subexp_from);
2551           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2552                                           mctx->eflags);
2553           dest_state = mctx->state_log[dest_str_idx];
2554           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2555                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2556           /* Add `new_dest_node' to state_log.  */
2557           if (dest_state == NULL)
2558             {
2559               mctx->state_log[dest_str_idx]
2560                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2561                                             context);
2562               if (BE (mctx->state_log[dest_str_idx] == NULL
2563                       && err != REG_NOERROR, 0))
2564                 goto free_return;
2565             }
2566           else
2567             {
2568               re_node_set dest_nodes;
2569               err = re_node_set_init_union (&dest_nodes,
2570                                             dest_state->entrance_nodes,
2571                                             new_dest_nodes);
2572               if (BE (err != REG_NOERROR, 0))
2573                 {
2574                   re_node_set_free (&dest_nodes);
2575                   goto free_return;
2576                 }
2577               mctx->state_log[dest_str_idx]
2578                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2579               re_node_set_free (&dest_nodes);
2580               if (BE (mctx->state_log[dest_str_idx] == NULL
2581                       && err != REG_NOERROR, 0))
2582                 goto free_return;
2583             }
2584           /* We need to check recursively if the backreference can epsilon
2585              transit.  */
2586           if (subexp_len == 0
2587               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2588             {
2589               err = check_subexp_matching_top (mctx, new_dest_nodes,
2590                                                cur_str_idx);
2591               if (BE (err != REG_NOERROR, 0))
2592                 goto free_return;
2593               err = transit_state_bkref (mctx, new_dest_nodes);
2594               if (BE (err != REG_NOERROR, 0))
2595                 goto free_return;
2596             }
2597         }
2598     }
2599   err = REG_NOERROR;
2600  free_return:
2601   return err;
2602 }
2603
2604 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2605    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2606    Note that we might collect inappropriate candidates here.
2607    However, the cost of checking them strictly here is too high, then we
2608    delay these checking for prune_impossible_nodes().  */
2609
2610 static reg_errcode_t
2611 internal_function
2612 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2613 {
2614   re_dfa_t *const dfa = mctx->dfa;
2615   int subexp_num, sub_top_idx;
2616   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2617   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2618   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2619   if (cache_idx != -1)
2620     {
2621       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2622       do
2623         if (entry->node == bkref_node)
2624           return REG_NOERROR; /* We already checked it.  */
2625       while (entry++->more);
2626     }
2627
2628   subexp_num = dfa->nodes[bkref_node].opr.idx;
2629
2630   /* For each sub expression  */
2631   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2632     {
2633       reg_errcode_t err;
2634       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2635       re_sub_match_last_t *sub_last;
2636       int sub_last_idx, sl_str, bkref_str_off;
2637
2638       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2639         continue; /* It isn't related.  */
2640
2641       sl_str = sub_top->str_idx;
2642       bkref_str_off = bkref_str_idx;
2643       /* At first, check the last node of sub expressions we already
2644          evaluated.  */
2645       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2646         {
2647           int sl_str_diff;
2648           sub_last = sub_top->lasts[sub_last_idx];
2649           sl_str_diff = sub_last->str_idx - sl_str;
2650           /* The matched string by the sub expression match with the substring
2651              at the back reference?  */
2652           if (sl_str_diff > 0)
2653             {
2654               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2655                 {
2656                   /* Not enough chars for a successful match.  */
2657                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2658                     break;
2659
2660                   err = clean_state_log_if_needed (mctx,
2661                                                    bkref_str_off
2662                                                    + sl_str_diff);
2663                   if (BE (err != REG_NOERROR, 0))
2664                     return err;
2665                   buf = (const char *) re_string_get_buffer (&mctx->input);
2666                 }
2667               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2668                 break; /* We don't need to search this sub expression any more.  */
2669             }
2670           bkref_str_off += sl_str_diff;
2671           sl_str += sl_str_diff;
2672           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2673                                 bkref_str_idx);
2674
2675           /* Reload buf, since the preceding call might have reallocated
2676              the buffer.  */
2677           buf = (const char *) re_string_get_buffer (&mctx->input);
2678
2679           if (err == REG_NOMATCH)
2680             continue;
2681           if (BE (err != REG_NOERROR, 0))
2682             return err;
2683         }
2684
2685       if (sub_last_idx < sub_top->nlasts)
2686         continue;
2687       if (sub_last_idx > 0)
2688         ++sl_str;
2689       /* Then, search for the other last nodes of the sub expression.  */
2690       for (; sl_str <= bkref_str_idx; ++sl_str)
2691         {
2692           int cls_node, sl_str_off;
2693           const re_node_set *nodes;
2694           sl_str_off = sl_str - sub_top->str_idx;
2695           /* The matched string by the sub expression match with the substring
2696              at the back reference?  */
2697           if (sl_str_off > 0)
2698             {
2699               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2700                 {
2701                   /* If we are at the end of the input, we cannot match.  */
2702                   if (bkref_str_off >= mctx->input.len)
2703                     break;
2704
2705                   err = extend_buffers (mctx);
2706                   if (BE (err != REG_NOERROR, 0))
2707                     return err;
2708
2709                   buf = (const char *) re_string_get_buffer (&mctx->input);
2710                 }
2711               if (buf [bkref_str_off++] != buf[sl_str - 1])
2712                 break; /* We don't need to search this sub expression
2713                           any more.  */
2714             }
2715           if (mctx->state_log[sl_str] == NULL)
2716             continue;
2717           /* Does this state have a ')' of the sub expression?  */
2718           nodes = &mctx->state_log[sl_str]->nodes;
2719           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2720           if (cls_node == -1)
2721             continue; /* No.  */
2722           if (sub_top->path == NULL)
2723             {
2724               sub_top->path = calloc (sizeof (state_array_t),
2725                                       sl_str - sub_top->str_idx + 1);
2726               if (sub_top->path == NULL)
2727                 return REG_ESPACE;
2728             }
2729           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2730              in the current context?  */
2731           err = check_arrival (mctx, sub_top->path, sub_top->node,
2732                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2733           if (err == REG_NOMATCH)
2734               continue;
2735           if (BE (err != REG_NOERROR, 0))
2736               return err;
2737           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2738           if (BE (sub_last == NULL, 0))
2739             return REG_ESPACE;
2740           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2741                                 bkref_str_idx);
2742           if (err == REG_NOMATCH)
2743             continue;
2744         }
2745     }
2746   return REG_NOERROR;
2747 }
2748
2749 /* Helper functions for get_subexp().  */
2750
2751 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2752    If it can arrive, register the sub expression expressed with SUB_TOP
2753    and SUB_LAST.  */
2754
2755 static reg_errcode_t
2756 internal_function
2757 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2758                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2759 {
2760   reg_errcode_t err;
2761   int to_idx;
2762   /* Can the subexpression arrive the back reference?  */
2763   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2764                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2765   if (err != REG_NOERROR)
2766     return err;
2767   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2768                              sub_last->str_idx);
2769   if (BE (err != REG_NOERROR, 0))
2770     return err;
2771   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2772   return clean_state_log_if_needed (mctx, to_idx);
2773 }
2774
2775 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2776    Search '(' if FL_OPEN, or search ')' otherwise.
2777    TODO: This function isn't efficient...
2778          Because there might be more than one nodes whose types are
2779          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2780          nodes.
2781          E.g. RE: (a){2}  */
2782
2783 static int
2784 internal_function
2785 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2786                   int subexp_idx, int type)
2787 {
2788   int cls_idx;
2789   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2790     {
2791       int cls_node = nodes->elems[cls_idx];
2792       const re_token_t *node = dfa->nodes + cls_node;
2793       if (node->type == type
2794           && node->opr.idx == subexp_idx)
2795         return cls_node;
2796     }
2797   return -1;
2798 }
2799
2800 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2801    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2802    heavily reused.
2803    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2804
2805 static reg_errcode_t
2806 internal_function
2807 check_arrival (re_match_context_t *mctx, state_array_t *path,
2808                int top_node, int top_str, int last_node, int last_str,
2809                int type)
2810 {
2811   re_dfa_t *const dfa = mctx->dfa;
2812   reg_errcode_t err;
2813   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2814   re_dfastate_t *cur_state = NULL;
2815   re_node_set *cur_nodes, next_nodes;
2816   re_dfastate_t **backup_state_log;
2817   unsigned int context;
2818
2819   subexp_num = dfa->nodes[top_node].opr.idx;
2820   /* Extend the buffer if we need.  */
2821   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2822     {
2823       re_dfastate_t **new_array;
2824       int old_alloc = path->alloc;
2825       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2826       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2827       if (new_array == NULL)
2828         {
2829           path->alloc = old_alloc;
2830           return REG_ESPACE;
2831         }
2832       path->array = new_array;
2833       memset (new_array + old_alloc, '\0',
2834               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2835     }
2836
2837   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2838
2839   /* Temporary modify MCTX.  */
2840   backup_state_log = mctx->state_log;
2841   backup_cur_idx = mctx->input.cur_idx;
2842   mctx->state_log = path->array;
2843   mctx->input.cur_idx = str_idx;
2844
2845   /* Setup initial node set.  */
2846   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2847   if (str_idx == top_str)
2848     {
2849       err = re_node_set_init_1 (&next_nodes, top_node);
2850       if (BE (err != REG_NOERROR, 0))
2851         return err;
2852       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2853       if (BE (err != REG_NOERROR, 0))
2854         {
2855           re_node_set_free (&next_nodes);
2856           return err;
2857         }
2858     }
2859   else
2860     {
2861       cur_state = mctx->state_log[str_idx];
2862       if (cur_state && cur_state->has_backref)
2863         {
2864           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2865           if (BE ( err != REG_NOERROR, 0))
2866             return err;
2867         }
2868       else
2869         re_node_set_init_empty (&next_nodes);
2870     }
2871   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2872     {
2873       if (next_nodes.nelem)
2874         {
2875           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2876                                     subexp_num, type);
2877           if (BE ( err != REG_NOERROR, 0))
2878             {
2879               re_node_set_free (&next_nodes);
2880               return err;
2881             }
2882         }
2883       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2884       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2885         {
2886           re_node_set_free (&next_nodes);
2887           return err;
2888         }
2889       mctx->state_log[str_idx] = cur_state;
2890     }
2891
2892   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2893     {
2894       re_node_set_empty (&next_nodes);
2895       if (mctx->state_log[str_idx + 1])
2896         {
2897           err = re_node_set_merge (&next_nodes,
2898                                    &mctx->state_log[str_idx + 1]->nodes);
2899           if (BE (err != REG_NOERROR, 0))
2900             {
2901               re_node_set_free (&next_nodes);
2902               return err;
2903             }
2904         }
2905       if (cur_state)
2906         {
2907           err = check_arrival_add_next_nodes (mctx, str_idx,
2908                                               &cur_state->non_eps_nodes, &next_nodes);
2909           if (BE (err != REG_NOERROR, 0))
2910             {
2911               re_node_set_free (&next_nodes);
2912               return err;
2913             }
2914         }
2915       ++str_idx;
2916       if (next_nodes.nelem)
2917         {
2918           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2919           if (BE (err != REG_NOERROR, 0))
2920             {
2921               re_node_set_free (&next_nodes);
2922               return err;
2923             }
2924           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2925                                     subexp_num, type);
2926           if (BE ( err != REG_NOERROR, 0))
2927             {
2928               re_node_set_free (&next_nodes);
2929               return err;
2930             }
2931         }
2932       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2933       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2934       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2935         {
2936           re_node_set_free (&next_nodes);
2937           return err;
2938         }
2939       mctx->state_log[str_idx] = cur_state;
2940       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2941     }
2942   re_node_set_free (&next_nodes);
2943   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2944                : &mctx->state_log[last_str]->nodes);
2945   path->next_idx = str_idx;
2946
2947   /* Fix MCTX.  */
2948   mctx->state_log = backup_state_log;
2949   mctx->input.cur_idx = backup_cur_idx;
2950
2951   /* Then check the current node set has the node LAST_NODE.  */
2952   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2953     return REG_NOERROR;
2954
2955   return REG_NOMATCH;
2956 }
2957
2958 /* Helper functions for check_arrival.  */
2959
2960 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2961    to NEXT_NODES.
2962    TODO: This function is similar to the functions transit_state*(),
2963          however this function has many additional works.
2964          Can't we unify them?  */
2965
2966 static reg_errcode_t
2967 internal_function
2968 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
2969                               re_node_set *cur_nodes,
2970                               re_node_set *next_nodes)
2971 {
2972   re_dfa_t *const dfa = mctx->dfa;
2973   int result;
2974   int cur_idx;
2975   reg_errcode_t err;
2976   re_node_set union_set;
2977   re_node_set_init_empty (&union_set);
2978   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2979     {
2980       int naccepted = 0;
2981       int cur_node = cur_nodes->elems[cur_idx];
2982 #ifdef DEBUG
2983       re_token_type_t type = dfa->nodes[cur_node].type;
2984       assert (!IS_EPSILON_NODE (type));
2985 #endif
2986 #ifdef RE_ENABLE_I18N
2987       /* If the node may accept `multi byte'.  */
2988       if (dfa->nodes[cur_node].accept_mb)
2989         {
2990           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2991                                                str_idx);
2992           if (naccepted > 1)
2993             {
2994               re_dfastate_t *dest_state;
2995               int next_node = dfa->nexts[cur_node];
2996               int next_idx = str_idx + naccepted;
2997               dest_state = mctx->state_log[next_idx];
2998               re_node_set_empty (&union_set);
2999               if (dest_state)
3000                 {
3001                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3002                   if (BE (err != REG_NOERROR, 0))
3003                     {
3004                       re_node_set_free (&union_set);
3005                       return err;
3006                     }
3007                 }
3008               result = re_node_set_insert (&union_set, next_node);
3009               if (BE (result < 0, 0))
3010                 {
3011                   re_node_set_free (&union_set);
3012                   return REG_ESPACE;
3013                 }
3014               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3015                                                             &union_set);
3016               if (BE (mctx->state_log[next_idx] == NULL
3017                       && err != REG_NOERROR, 0))
3018                 {
3019                   re_node_set_free (&union_set);
3020                   return err;
3021                 }
3022             }
3023         }
3024 #endif /* RE_ENABLE_I18N */
3025       if (naccepted
3026           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3027         {
3028           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3029           if (BE (result < 0, 0))
3030             {
3031               re_node_set_free (&union_set);
3032               return REG_ESPACE;
3033             }
3034         }
3035     }
3036   re_node_set_free (&union_set);
3037   return REG_NOERROR;
3038 }
3039
3040 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3041    CUR_NODES, however exclude the nodes which are:
3042     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3043     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3044 */
3045
3046 static reg_errcode_t
3047 internal_function
3048 check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3049                           int ex_subexp, int type)
3050 {
3051   reg_errcode_t err;
3052   int idx, outside_node;
3053   re_node_set new_nodes;
3054 #ifdef DEBUG
3055   assert (cur_nodes->nelem);
3056 #endif
3057   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3058   if (BE (err != REG_NOERROR, 0))
3059     return err;
3060   /* Create a new node set NEW_NODES with the nodes which are epsilon
3061      closures of the node in CUR_NODES.  */
3062
3063   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3064     {
3065       int cur_node = cur_nodes->elems[idx];
3066       re_node_set *eclosure = dfa->eclosures + cur_node;
3067       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3068       if (outside_node == -1)
3069         {
3070           /* There are no problematic nodes, just merge them.  */
3071           err = re_node_set_merge (&new_nodes, eclosure);
3072           if (BE (err != REG_NOERROR, 0))
3073             {
3074               re_node_set_free (&new_nodes);
3075               return err;
3076             }
3077         }
3078       else
3079         {
3080           /* There are problematic nodes, re-calculate incrementally.  */
3081           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3082                                               ex_subexp, type);
3083           if (BE (err != REG_NOERROR, 0))
3084             {
3085               re_node_set_free (&new_nodes);
3086               return err;
3087             }
3088         }
3089     }
3090   re_node_set_free (cur_nodes);
3091   *cur_nodes = new_nodes;
3092   return REG_NOERROR;
3093 }
3094
3095 /* Helper function for check_arrival_expand_ecl.
3096    Check incrementally the epsilon closure of TARGET, and if it isn't
3097    problematic append it to DST_NODES.  */
3098
3099 static reg_errcode_t
3100 internal_function
3101 check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3102                               int target, int ex_subexp, int type)
3103 {
3104   int cur_node;
3105   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3106     {
3107       int err;
3108
3109       if (dfa->nodes[cur_node].type == type
3110           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3111         {
3112           if (type == OP_CLOSE_SUBEXP)
3113             {
3114               err = re_node_set_insert (dst_nodes, cur_node);
3115               if (BE (err == -1, 0))
3116                 return REG_ESPACE;
3117             }
3118           break;
3119         }
3120       err = re_node_set_insert (dst_nodes, cur_node);
3121       if (BE (err == -1, 0))
3122         return REG_ESPACE;
3123       if (dfa->edests[cur_node].nelem == 0)
3124         break;
3125       if (dfa->edests[cur_node].nelem == 2)
3126         {
3127           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3128                                               dfa->edests[cur_node].elems[1],
3129                                               ex_subexp, type);
3130           if (BE (err != REG_NOERROR, 0))
3131             return err;
3132         }
3133       cur_node = dfa->edests[cur_node].elems[0];
3134     }
3135   return REG_NOERROR;
3136 }
3137
3138
3139 /* For all the back references in the current state, calculate the
3140    destination of the back references by the appropriate entry
3141    in MCTX->BKREF_ENTS.  */
3142
3143 static reg_errcode_t
3144 internal_function
3145 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3146                     int cur_str, int subexp_num, int type)
3147 {
3148   re_dfa_t *const dfa = mctx->dfa;
3149   reg_errcode_t err;
3150   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3151   struct re_backref_cache_entry *ent;
3152
3153   if (cache_idx_start == -1)
3154     return REG_NOERROR;
3155
3156  restart:
3157   ent = mctx->bkref_ents + cache_idx_start;
3158   do
3159     {
3160       int to_idx, next_node;
3161
3162       /* Is this entry ENT is appropriate?  */
3163       if (!re_node_set_contains (cur_nodes, ent->node))
3164         continue; /* No.  */
3165
3166       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3167       /* Calculate the destination of the back reference, and append it
3168          to MCTX->STATE_LOG.  */
3169       if (to_idx == cur_str)
3170         {
3171           /* The backreference did epsilon transit, we must re-check all the
3172              node in the current state.  */
3173           re_node_set new_dests;
3174           reg_errcode_t err2, err3;
3175           next_node = dfa->edests[ent->node].elems[0];
3176           if (re_node_set_contains (cur_nodes, next_node))
3177             continue;
3178           err = re_node_set_init_1 (&new_dests, next_node);
3179           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3180           err3 = re_node_set_merge (cur_nodes, &new_dests);
3181           re_node_set_free (&new_dests);
3182           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3183                   || err3 != REG_NOERROR, 0))
3184             {
3185               err = (err != REG_NOERROR ? err
3186                      : (err2 != REG_NOERROR ? err2 : err3));
3187               return err;
3188             }
3189           /* TODO: It is still inefficient...  */
3190           goto restart;
3191         }
3192       else
3193         {
3194           re_node_set union_set;
3195           next_node = dfa->nexts[ent->node];
3196           if (mctx->state_log[to_idx])
3197             {
3198               int ret;
3199               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3200                                         next_node))
3201                 continue;
3202               err = re_node_set_init_copy (&union_set,
3203                                            &mctx->state_log[to_idx]->nodes);
3204               ret = re_node_set_insert (&union_set, next_node);
3205               if (BE (err != REG_NOERROR || ret < 0, 0))
3206                 {
3207                   re_node_set_free (&union_set);
3208                   err = err != REG_NOERROR ? err : REG_ESPACE;
3209                   return err;
3210                 }
3211             }
3212           else
3213             {
3214               err = re_node_set_init_1 (&union_set, next_node);
3215               if (BE (err != REG_NOERROR, 0))
3216                 return err;
3217             }
3218           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3219           re_node_set_free (&union_set);
3220           if (BE (mctx->state_log[to_idx] == NULL
3221                   && err != REG_NOERROR, 0))
3222             return err;
3223         }
3224     }
3225   while (ent++->more);
3226   return REG_NOERROR;
3227 }
3228
3229 /* Build transition table for the state.
3230    Return 1 if succeeded, otherwise return NULL.  */
3231
3232 static int
3233 internal_function
3234 build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3235 {
3236   reg_errcode_t err;
3237   int i, j, ch, need_word_trtable = 0;
3238   unsigned int elem, mask;
3239   int dests_node_malloced = 0, dest_states_malloced = 0;
3240   int ndests; /* Number of the destination states from `state'.  */
3241   re_dfastate_t **trtable;
3242   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3243   re_node_set follows, *dests_node;
3244   bitset *dests_ch;
3245   bitset acceptable;
3246
3247   /* We build DFA states which corresponds to the destination nodes
3248      from `state'.  `dests_node[i]' represents the nodes which i-th
3249      destination state contains, and `dests_ch[i]' represents the
3250      characters which i-th destination state accepts.  */
3251 #ifdef _LIBC
3252   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3253     dests_node = (re_node_set *)
3254       alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3255   else
3256 #endif
3257     {
3258       dests_node = (re_node_set *)
3259         malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3260       if (BE (dests_node == NULL, 0))
3261         return 0;
3262       dests_node_malloced = 1;
3263     }
3264   dests_ch = (bitset *) (dests_node + SBC_MAX);
3265
3266   /* Initialize transiton table.  */
3267   state->word_trtable = state->trtable = NULL;
3268
3269   /* At first, group all nodes belonging to `state' into several
3270      destinations.  */
3271   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3272   if (BE (ndests <= 0, 0))
3273     {
3274       if (dests_node_malloced)
3275         free (dests_node);
3276       /* Return 0 in case of an error, 1 otherwise.  */
3277       if (ndests == 0)
3278         {
3279           state->trtable = (re_dfastate_t **)
3280             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3281           return 1;
3282         }
3283       return 0;
3284     }
3285
3286   err = re_node_set_alloc (&follows, ndests + 1);
3287   if (BE (err != REG_NOERROR, 0))
3288     goto out_free;
3289
3290 #ifdef _LIBC
3291   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3292                          + ndests * 3 * sizeof (re_dfastate_t *)))
3293     dest_states = (re_dfastate_t **)
3294       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3295   else
3296 #endif
3297     {
3298       dest_states = (re_dfastate_t **)
3299         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3300       if (BE (dest_states == NULL, 0))
3301         {
3302 out_free:
3303           if (dest_states_malloced)
3304             free (dest_states);
3305           re_node_set_free (&follows);
3306           for (i = 0; i < ndests; ++i)
3307             re_node_set_free (dests_node + i);
3308           if (dests_node_malloced)
3309             free (dests_node);
3310           return 0;
3311         }
3312       dest_states_malloced = 1;
3313     }
3314   dest_states_word = dest_states + ndests;
3315   dest_states_nl = dest_states_word + ndests;
3316   bitset_empty (acceptable);
3317
3318   /* Then build the states for all destinations.  */
3319   for (i = 0; i < ndests; ++i)
3320     {
3321       int next_node;
3322       re_node_set_empty (&follows);
3323       /* Merge the follows of this destination states.  */
3324       for (j = 0; j < dests_node[i].nelem; ++j)
3325         {
3326           next_node = dfa->nexts[dests_node[i].elems[j]];
3327           if (next_node != -1)
3328             {
3329               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3330               if (BE (err != REG_NOERROR, 0))
3331                 goto out_free;
3332             }
3333         }
3334       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3335       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3336         goto out_free;
3337       /* If the new state has context constraint,
3338          build appropriate states for these contexts.  */
3339       if (dest_states[i]->has_constraint)
3340         {
3341           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3342                                                           CONTEXT_WORD);
3343           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3344             goto out_free;
3345
3346           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3347             need_word_trtable = 1;
3348
3349           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3350                                                         CONTEXT_NEWLINE);
3351           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3352             goto out_free;
3353         }
3354       else
3355         {
3356           dest_states_word[i] = dest_states[i];
3357           dest_states_nl[i] = dest_states[i];
3358         }
3359       bitset_merge (acceptable, dests_ch[i]);
3360     }
3361
3362   if (!BE (need_word_trtable, 0))
3363     {
3364       /* We don't care about whether the following character is a word
3365          character, or we are in a single-byte character set so we can
3366          discern by looking at the character code: allocate a
3367          256-entry transition table.  */
3368       trtable = state->trtable =
3369         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3370       if (BE (trtable == NULL, 0))
3371         goto out_free;
3372
3373       /* For all characters ch...:  */
3374       for (i = 0; i < BITSET_UINTS; ++i)
3375         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3376              elem;
3377              mask <<= 1, elem >>= 1, ++ch)
3378           if (BE (elem & 1, 0))
3379             {
3380               /* There must be exactly one destination which accepts
3381                  character ch.  See group_nodes_into_DFAstates.  */
3382               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3383                 ;
3384
3385               /* j-th destination accepts the word character ch.  */
3386               if (dfa->word_char[i] & mask)
3387                 trtable[ch] = dest_states_word[j];
3388               else
3389                 trtable[ch] = dest_states[j];
3390             }
3391     }
3392   else
3393     {
3394       /* We care about whether the following character is a word
3395          character, and we are in a multi-byte character set: discern
3396          by looking at the character code: build two 256-entry
3397          transition tables, one starting at trtable[0] and one
3398          starting at trtable[SBC_MAX].  */
3399       trtable = state->word_trtable =
3400         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3401       if (BE (trtable == NULL, 0))
3402         goto out_free;
3403
3404       /* For all characters ch...:  */
3405       for (i = 0; i < BITSET_UINTS; ++i)
3406         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3407              elem;
3408              mask <<= 1, elem >>= 1, ++ch)
3409           if (BE (elem & 1, 0))
3410             {
3411               /* There must be exactly one destination which accepts
3412                  character ch.  See group_nodes_into_DFAstates.  */
3413               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3414                 ;
3415
3416               /* j-th destination accepts the word character ch.  */
3417               trtable[ch] = dest_states[j];
3418               trtable[ch + SBC_MAX] = dest_states_word[j];
3419             }
3420     }
3421
3422   /* new line */
3423   if (bitset_contain (acceptable, NEWLINE_CHAR))
3424     {
3425       /* The current state accepts newline character.  */
3426       for (j = 0; j < ndests; ++j)
3427         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3428           {
3429             /* k-th destination accepts newline character.  */
3430             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3431             if (need_word_trtable)
3432               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3433             /* There must be only one destination which accepts
3434                newline.  See group_nodes_into_DFAstates.  */
3435             break;
3436           }
3437     }
3438
3439   if (dest_states_malloced)
3440     free (dest_states);
3441
3442   re_node_set_free (&follows);
3443   for (i = 0; i < ndests; ++i)
3444     re_node_set_free (dests_node + i);
3445
3446   if (dests_node_malloced)
3447     free (dests_node);
3448
3449   return 1;
3450 }
3451
3452 /* Group all nodes belonging to STATE into several destinations.
3453    Then for all destinations, set the nodes belonging to the destination
3454    to DESTS_NODE[i] and set the characters accepted by the destination
3455    to DEST_CH[i].  This function return the number of destinations.  */
3456
3457 static int
3458 internal_function
3459 group_nodes_into_DFAstates (re_dfa_t *dfa, const re_dfastate_t *state,
3460                             re_node_set *dests_node, bitset *dests_ch)
3461 {
3462   reg_errcode_t err;
3463   int result;
3464   int i, j, k;
3465   int ndests; /* Number of the destinations from `state'.  */
3466   bitset accepts; /* Characters a node can accept.  */
3467   const re_node_set *cur_nodes = &state->nodes;
3468   bitset_empty (accepts);
3469   ndests = 0;
3470
3471   /* For all the nodes belonging to `state',  */
3472   for (i = 0; i < cur_nodes->nelem; ++i)
3473     {
3474       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3475       re_token_type_t type = node->type;
3476       unsigned int constraint = node->constraint;
3477
3478       /* Enumerate all single byte character this node can accept.  */
3479       if (type == CHARACTER)
3480         bitset_set (accepts, node->opr.c);
3481       else if (type == SIMPLE_BRACKET)
3482         {
3483           bitset_merge (accepts, node->opr.sbcset);
3484         }
3485       else if (type == OP_PERIOD)
3486         {
3487 #ifdef RE_ENABLE_I18N
3488           if (dfa->mb_cur_max > 1)
3489             bitset_merge (accepts, dfa->sb_char);
3490           else
3491 #endif
3492             bitset_set_all (accepts);
3493           if (!(dfa->syntax & RE_DOT_NEWLINE))
3494             bitset_clear (accepts, '\n');
3495           if (dfa->syntax & RE_DOT_NOT_NULL)
3496             bitset_clear (accepts, '\0');
3497         }
3498 #ifdef RE_ENABLE_I18N
3499       else if (type == OP_UTF8_PERIOD)
3500         {
3501           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3502           if (!(dfa->syntax & RE_DOT_NEWLINE))
3503             bitset_clear (accepts, '\n');
3504           if (dfa->syntax & RE_DOT_NOT_NULL)
3505             bitset_clear (accepts, '\0');
3506         }
3507 #endif
3508       else
3509         continue;
3510
3511       /* Check the `accepts' and sift the characters which are not
3512          match it the context.  */
3513       if (constraint)
3514         {
3515           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3516             {
3517               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3518               bitset_empty (accepts);
3519               if (accepts_newline)
3520                 bitset_set (accepts, NEWLINE_CHAR);
3521               else
3522                 continue;
3523             }
3524           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3525             {
3526               bitset_empty (accepts);
3527               continue;
3528             }
3529
3530           if (constraint & NEXT_WORD_CONSTRAINT)
3531             {
3532               unsigned int any_set = 0;
3533               if (type == CHARACTER && !node->word_char)
3534                 {
3535                   bitset_empty (accepts);
3536                   continue;
3537                 }
3538 #ifdef RE_ENABLE_I18N
3539               if (dfa->mb_cur_max > 1)
3540                 for (j = 0; j < BITSET_UINTS; ++j)
3541                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3542               else
3543 #endif
3544                 for (j = 0; j < BITSET_UINTS; ++j)
3545                   any_set |= (accepts[j] &= dfa->word_char[j]);
3546               if (!any_set)
3547                 continue;
3548             }
3549           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3550             {
3551               unsigned int any_set = 0;
3552               if (type == CHARACTER && node->word_char)
3553                 {
3554                   bitset_empty (accepts);
3555                   continue;
3556                 }
3557 #ifdef RE_ENABLE_I18N
3558               if (dfa->mb_cur_max > 1)
3559                 for (j = 0; j < BITSET_UINTS; ++j)
3560                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3561               else
3562 #endif
3563                 for (j = 0; j < BITSET_UINTS; ++j)
3564                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3565               if (!any_set)
3566                 continue;
3567             }
3568         }
3569
3570       /* Then divide `accepts' into DFA states, or create a new
3571          state.  Above, we make sure that accepts is not empty.  */
3572       for (j = 0; j < ndests; ++j)
3573         {
3574           bitset intersec; /* Intersection sets, see below.  */
3575           bitset remains;
3576           /* Flags, see below.  */
3577           int has_intersec, not_subset, not_consumed;
3578
3579           /* Optimization, skip if this state doesn't accept the character.  */
3580           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3581             continue;
3582
3583           /* Enumerate the intersection set of this state and `accepts'.  */
3584           has_intersec = 0;
3585           for (k = 0; k < BITSET_UINTS; ++k)
3586             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3587           /* And skip if the intersection set is empty.  */
3588           if (!has_intersec)
3589             continue;
3590
3591           /* Then check if this state is a subset of `accepts'.  */
3592           not_subset = not_consumed = 0;
3593           for (k = 0; k < BITSET_UINTS; ++k)
3594             {
3595               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3596               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3597             }
3598
3599           /* If this state isn't a subset of `accepts', create a
3600              new group state, which has the `remains'. */
3601           if (not_subset)
3602             {
3603               bitset_copy (dests_ch[ndests], remains);
3604               bitset_copy (dests_ch[j], intersec);
3605               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3606               if (BE (err != REG_NOERROR, 0))
3607                 goto error_return;
3608               ++ndests;
3609             }
3610
3611           /* Put the position in the current group. */
3612           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3613           if (BE (result < 0, 0))
3614             goto error_return;
3615
3616           /* If all characters are consumed, go to next node. */
3617           if (!not_consumed)
3618             break;
3619         }
3620       /* Some characters remain, create a new group. */
3621       if (j == ndests)
3622         {
3623           bitset_copy (dests_ch[ndests], accepts);
3624           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3625           if (BE (err != REG_NOERROR, 0))
3626             goto error_return;
3627           ++ndests;
3628           bitset_empty (accepts);
3629         }
3630     }
3631   return ndests;
3632  error_return:
3633   for (j = 0; j < ndests; ++j)
3634     re_node_set_free (dests_node + j);
3635   return -1;
3636 }
3637
3638 #ifdef RE_ENABLE_I18N
3639 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3640    Return the number of the bytes the node accepts.
3641    STR_IDX is the current index of the input string.
3642
3643    This function handles the nodes which can accept one character, or
3644    one collating element like '.', '[a-z]', opposite to the other nodes
3645    can only accept one byte.  */
3646
3647 static int
3648 internal_function
3649 check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
3650                          const re_string_t *input, int str_idx)
3651 {
3652   const re_token_t *node = dfa->nodes + node_idx;
3653   int char_len, elem_len;
3654   int i;
3655
3656   if (BE (node->type == OP_UTF8_PERIOD, 0))
3657     {
3658       unsigned char c = re_string_byte_at (input, str_idx), d;
3659       if (BE (c < 0xc2, 1))
3660         return 0;
3661
3662       if (str_idx + 2 > input->len)
3663         return 0;
3664
3665       d = re_string_byte_at (input, str_idx + 1);
3666       if (c < 0xe0)
3667         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3668       else if (c < 0xf0)
3669         {
3670           char_len = 3;
3671           if (c == 0xe0 && d < 0xa0)
3672             return 0;
3673         }
3674       else if (c < 0xf8)
3675         {
3676           char_len = 4;
3677           if (c == 0xf0 && d < 0x90)
3678             return 0;
3679         }
3680       else if (c < 0xfc)
3681         {
3682           char_len = 5;
3683           if (c == 0xf8 && d < 0x88)
3684             return 0;
3685         }
3686       else if (c < 0xfe)
3687         {
3688           char_len = 6;
3689           if (c == 0xfc && d < 0x84)
3690             return 0;
3691         }
3692       else
3693         return 0;
3694
3695       if (str_idx + char_len > input->len)
3696         return 0;
3697
3698       for (i = 1; i < char_len; ++i)
3699         {
3700           d = re_string_byte_at (input, str_idx + i);
3701           if (d < 0x80 || d > 0xbf)
3702             return 0;
3703         }
3704       return char_len;
3705     }
3706
3707   char_len = re_string_char_size_at (input, str_idx);
3708   if (node->type == OP_PERIOD)
3709     {
3710       if (char_len <= 1)
3711         return 0;
3712       /* FIXME: I don't think this if is needed, as both '\n'
3713          and '\0' are char_len == 1.  */
3714       /* '.' accepts any one character except the following two cases.  */
3715       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3716            re_string_byte_at (input, str_idx) == '\n') ||
3717           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3718            re_string_byte_at (input, str_idx) == '\0'))
3719         return 0;
3720       return char_len;
3721     }
3722
3723   elem_len = re_string_elem_size_at (input, str_idx);
3724   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3725     return 0;
3726
3727   if (node->type == COMPLEX_BRACKET)
3728     {
3729       const re_charset_t *cset = node->opr.mbcset;
3730 # ifdef _LIBC
3731       const unsigned char *pin
3732         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3733       int j;
3734       uint32_t nrules;
3735 # endif /* _LIBC */
3736       int match_len = 0;
3737       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3738                     ? re_string_wchar_at (input, str_idx) : 0);
3739
3740       /* match with multibyte character?  */
3741       for (i = 0; i < cset->nmbchars; ++i)
3742         if (wc == cset->mbchars[i])
3743           {
3744             match_len = char_len;
3745             goto check_node_accept_bytes_match;
3746           }
3747       /* match with character_class?  */
3748       for (i = 0; i < cset->nchar_classes; ++i)
3749         {
3750           wctype_t wt = cset->char_classes[i];
3751           if (__iswctype (wc, wt))
3752             {
3753               match_len = char_len;
3754               goto check_node_accept_bytes_match;
3755             }
3756         }
3757
3758 # ifdef _LIBC
3759       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3760       if (nrules != 0)
3761         {
3762           unsigned int in_collseq = 0;
3763           const int32_t *table, *indirect;
3764           const unsigned char *weights, *extra;
3765           const char *collseqwc;
3766           int32_t idx;
3767           /* This #include defines a local function!  */
3768 #  include <locale/weight.h>
3769
3770           /* match with collating_symbol?  */
3771           if (cset->ncoll_syms)
3772             extra = (const unsigned char *)
3773               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3774           for (i = 0; i < cset->ncoll_syms; ++i)
3775             {
3776               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3777               /* Compare the length of input collating element and
3778                  the length of current collating element.  */
3779               if (*coll_sym != elem_len)
3780                 continue;
3781               /* Compare each bytes.  */
3782               for (j = 0; j < *coll_sym; j++)
3783                 if (pin[j] != coll_sym[1 + j])
3784                   break;
3785               if (j == *coll_sym)
3786                 {
3787                   /* Match if every bytes is equal.  */
3788                   match_len = j;
3789                   goto check_node_accept_bytes_match;
3790                 }
3791             }
3792
3793           if (cset->nranges)
3794             {
3795               if (elem_len <= char_len)
3796                 {
3797                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3798                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3799                 }
3800               else
3801                 in_collseq = find_collation_sequence_value (pin, elem_len);
3802             }
3803           /* match with range expression?  */
3804           for (i = 0; i < cset->nranges; ++i)
3805             if (cset->range_starts[i] <= in_collseq
3806                 && in_collseq <= cset->range_ends[i])
3807               {
3808                 match_len = elem_len;
3809                 goto check_node_accept_bytes_match;
3810               }
3811
3812           /* match with equivalence_class?  */
3813           if (cset->nequiv_classes)
3814             {
3815               const unsigned char *cp = pin;
3816               table = (const int32_t *)
3817                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3818               weights = (const unsigned char *)
3819                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3820               extra = (const unsigned char *)
3821                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3822               indirect = (const int32_t *)
3823                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3824               idx = findidx (&cp);
3825               if (idx > 0)
3826                 for (i = 0; i < cset->nequiv_classes; ++i)
3827                   {
3828                     int32_t equiv_class_idx = cset->equiv_classes[i];
3829                     size_t weight_len = weights[idx];
3830                     if (weight_len == weights[equiv_class_idx])
3831                       {
3832                         int cnt = 0;
3833                         while (cnt <= weight_len
3834                                && (weights[equiv_class_idx + 1 + cnt]
3835                                    == weights[idx + 1 + cnt]))
3836                           ++cnt;
3837                         if (cnt > weight_len)
3838                           {
3839                             match_len = elem_len;
3840                             goto check_node_accept_bytes_match;
3841                           }
3842                       }
3843                   }
3844             }
3845         }
3846       else
3847 # endif /* _LIBC */
3848         {
3849           /* match with range expression?  */
3850 #if __GNUC__ >= 2
3851           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3852 #else
3853           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3854           cmp_buf[2] = wc;
3855 #endif
3856           for (i = 0; i < cset->nranges; ++i)
3857             {
3858               cmp_buf[0] = cset->range_starts[i];
3859               cmp_buf[4] = cset->range_ends[i];
3860               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3861                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3862                 {
3863                   match_len = char_len;
3864                   goto check_node_accept_bytes_match;
3865                 }
3866             }
3867         }
3868     check_node_accept_bytes_match:
3869       if (!cset->non_match)
3870         return match_len;
3871       else
3872         {
3873           if (match_len > 0)
3874             return 0;
3875           else
3876             return (elem_len > char_len) ? elem_len : char_len;
3877         }
3878     }
3879   return 0;
3880 }
3881
3882 # ifdef _LIBC
3883 static unsigned int
3884 find_collation_sequence_value (mbs, mbs_len)
3885     const unsigned char *mbs;
3886     size_t mbs_len;
3887 {
3888   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3889   if (nrules == 0)
3890     {
3891       if (mbs_len == 1)
3892         {
3893           /* No valid character.  Match it as a single byte character.  */
3894           const unsigned char *collseq = (const unsigned char *)
3895             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3896           return collseq[mbs[0]];
3897         }
3898       return UINT_MAX;
3899     }
3900   else
3901     {
3902       int32_t idx;
3903       const unsigned char *extra = (const unsigned char *)
3904         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3905       int32_t extrasize = (const unsigned char *)
3906         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3907
3908       for (idx = 0; idx < extrasize;)
3909         {
3910           int mbs_cnt, found = 0;
3911           int32_t elem_mbs_len;
3912           /* Skip the name of collating element name.  */
3913           idx = idx + extra[idx] + 1;
3914           elem_mbs_len = extra[idx++];
3915           if (mbs_len == elem_mbs_len)
3916             {
3917               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3918                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3919                   break;
3920               if (mbs_cnt == elem_mbs_len)
3921                 /* Found the entry.  */
3922                 found = 1;
3923             }
3924           /* Skip the byte sequence of the collating element.  */
3925           idx += elem_mbs_len;
3926           /* Adjust for the alignment.  */
3927           idx = (idx + 3) & ~3;
3928           /* Skip the collation sequence value.  */
3929           idx += sizeof (uint32_t);
3930           /* Skip the wide char sequence of the collating element.  */
3931           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3932           /* If we found the entry, return the sequence value.  */
3933           if (found)
3934             return *(uint32_t *) (extra + idx);
3935           /* Skip the collation sequence value.  */
3936           idx += sizeof (uint32_t);
3937         }
3938       return UINT_MAX;
3939     }
3940 }
3941 # endif /* _LIBC */
3942 #endif /* RE_ENABLE_I18N */
3943
3944 /* Check whether the node accepts the byte which is IDX-th
3945    byte of the INPUT.  */
3946
3947 static int
3948 internal_function
3949 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3950                    int idx)
3951 {
3952   unsigned char ch;
3953   ch = re_string_byte_at (&mctx->input, idx);
3954   switch (node->type)
3955     {
3956     case CHARACTER:
3957       if (node->opr.c != ch)
3958         return 0;
3959       break;
3960
3961     case SIMPLE_BRACKET:
3962       if (!bitset_contain (node->opr.sbcset, ch))
3963         return 0;
3964       break;
3965
3966 #ifdef RE_ENABLE_I18N
3967     case OP_UTF8_PERIOD:
3968       if (ch >= 0x80)
3969         return 0;
3970       /* FALLTHROUGH */
3971 #endif
3972     case OP_PERIOD:
3973       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3974           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3975         return 0;
3976       break;
3977
3978     default:
3979       return 0;
3980     }
3981
3982   if (node->constraint)
3983     {
3984       /* The node has constraints.  Check whether the current context
3985          satisfies the constraints.  */
3986       unsigned int context = re_string_context_at (&mctx->input, idx,
3987                                                    mctx->eflags);
3988       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3989         return 0;
3990     }
3991
3992   return 1;
3993 }
3994
3995 /* Extend the buffers, if the buffers have run out.  */
3996
3997 static reg_errcode_t
3998 internal_function
3999 extend_buffers (re_match_context_t *mctx)
4000 {
4001   reg_errcode_t ret;
4002   re_string_t *pstr = &mctx->input;
4003
4004   /* Double the lengthes of the buffers.  */
4005   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4006   if (BE (ret != REG_NOERROR, 0))
4007     return ret;
4008
4009   if (mctx->state_log != NULL)
4010     {
4011       /* And double the length of state_log.  */
4012       /* XXX We have no indication of the size of this buffer.  If this
4013          allocation fail we have no indication that the state_log array
4014          does not have the right size.  */
4015       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4016                                               pstr->bufs_len + 1);
4017       if (BE (new_array == NULL, 0))
4018         return REG_ESPACE;
4019       mctx->state_log = new_array;
4020     }
4021
4022   /* Then reconstruct the buffers.  */
4023   if (pstr->icase)
4024     {
4025 #ifdef RE_ENABLE_I18N
4026       if (pstr->mb_cur_max > 1)
4027         {
4028           ret = build_wcs_upper_buffer (pstr);
4029           if (BE (ret != REG_NOERROR, 0))
4030             return ret;
4031         }
4032       else
4033 #endif /* RE_ENABLE_I18N  */
4034         build_upper_buffer (pstr);
4035     }
4036   else
4037     {
4038 #ifdef RE_ENABLE_I18N
4039       if (pstr->mb_cur_max > 1)
4040         build_wcs_buffer (pstr);
4041       else
4042 #endif /* RE_ENABLE_I18N  */
4043         {
4044           if (pstr->trans != NULL)
4045             re_string_translate_buffer (pstr);
4046         }
4047     }
4048   return REG_NOERROR;
4049 }
4050
4051 \f
4052 /* Functions for matching context.  */
4053
4054 /* Initialize MCTX.  */
4055
4056 static reg_errcode_t
4057 internal_function
4058 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4059 {
4060   mctx->eflags = eflags;
4061   mctx->match_last = -1;
4062   if (n > 0)
4063     {
4064       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4065       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4066       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4067         return REG_ESPACE;
4068     }
4069   /* Already zero-ed by the caller.
4070      else
4071        mctx->bkref_ents = NULL;
4072      mctx->nbkref_ents = 0;
4073      mctx->nsub_tops = 0;  */
4074   mctx->abkref_ents = n;
4075   mctx->max_mb_elem_len = 1;
4076   mctx->asub_tops = n;
4077   return REG_NOERROR;
4078 }
4079
4080 /* Clean the entries which depend on the current input in MCTX.
4081    This function must be invoked when the matcher changes the start index
4082    of the input, or changes the input string.  */
4083
4084 static void
4085 internal_function
4086 match_ctx_clean (re_match_context_t *mctx)
4087 {
4088   int st_idx;
4089   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4090     {
4091       int sl_idx;
4092       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4093       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4094         {
4095           re_sub_match_last_t *last = top->lasts[sl_idx];
4096           re_free (last->path.array);
4097           re_free (last);
4098         }
4099       re_free (top->lasts);
4100       if (top->path)
4101         {
4102           re_free (top->path->array);
4103           re_free (top->path);
4104         }
4105       free (top);
4106     }
4107
4108   mctx->nsub_tops = 0;
4109   mctx->nbkref_ents = 0;
4110 }
4111
4112 /* Free all the memory associated with MCTX.  */
4113
4114 static void
4115 internal_function
4116 match_ctx_free (re_match_context_t *mctx)
4117 {
4118   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4119   match_ctx_clean (mctx);
4120   re_free (mctx->sub_tops);
4121   re_free (mctx->bkref_ents);
4122 }
4123
4124 /* Add a new backreference entry to MCTX.
4125    Note that we assume that caller never call this function with duplicate
4126    entry, and call with STR_IDX which isn't smaller than any existing entry.
4127 */
4128
4129 static reg_errcode_t
4130 internal_function
4131 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx,
4132                      int from, int to)
4133 {
4134   if (mctx->nbkref_ents >= mctx->abkref_ents)
4135     {
4136       struct re_backref_cache_entry* new_entry;
4137       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4138                               mctx->abkref_ents * 2);
4139       if (BE (new_entry == NULL, 0))
4140         {
4141           re_free (mctx->bkref_ents);
4142           return REG_ESPACE;
4143         }
4144       mctx->bkref_ents = new_entry;
4145       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4146               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4147       mctx->abkref_ents *= 2;
4148     }
4149   if (mctx->nbkref_ents > 0
4150       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4151     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4152
4153   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4154   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4155   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4156   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4157
4158   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4159      If bit N is clear, means that this entry won't epsilon-transition to
4160      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4161      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4162      such node.
4163
4164      A backreference does not epsilon-transition unless it is empty, so set
4165      to all zeros if FROM != TO.  */
4166   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4167     = (from == to ? ~0 : 0);
4168
4169   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4170   if (mctx->max_mb_elem_len < to - from)
4171     mctx->max_mb_elem_len = to - from;
4172   return REG_NOERROR;
4173 }
4174
4175 /* Search for the first entry which has the same str_idx, or -1 if none is
4176    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4177
4178 static int
4179 internal_function
4180 search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
4181 {
4182   int left, right, mid, last;
4183   last = right = mctx->nbkref_ents;
4184   for (left = 0; left < right;)
4185     {
4186       mid = (left + right) / 2;
4187       if (mctx->bkref_ents[mid].str_idx < str_idx)
4188         left = mid + 1;
4189       else
4190         right = mid;
4191     }
4192   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4193     return left;
4194   else
4195     return -1;
4196 }
4197
4198 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4199    at STR_IDX.  */
4200
4201 static reg_errcode_t
4202 internal_function
4203 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4204 {
4205 #ifdef DEBUG
4206   assert (mctx->sub_tops != NULL);
4207   assert (mctx->asub_tops > 0);
4208 #endif
4209   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4210     {
4211       int new_asub_tops = mctx->asub_tops * 2;
4212       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4213                                                    re_sub_match_top_t *,
4214                                                    new_asub_tops);
4215       if (BE (new_array == NULL, 0))
4216         return REG_ESPACE;
4217       mctx->sub_tops = new_array;
4218       mctx->asub_tops = new_asub_tops;
4219     }
4220   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4221   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4222     return REG_ESPACE;
4223   mctx->sub_tops[mctx->nsub_tops]->node = node;
4224   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4225   return REG_NOERROR;
4226 }
4227
4228 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4229    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4230
4231 static re_sub_match_last_t *
4232 internal_function
4233 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4234 {
4235   re_sub_match_last_t *new_entry;
4236   if (BE (subtop->nlasts == subtop->alasts, 0))
4237     {
4238       int new_alasts = 2 * subtop->alasts + 1;
4239       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4240                                                     re_sub_match_last_t *,
4241                                                     new_alasts);
4242       if (BE (new_array == NULL, 0))
4243         return NULL;
4244       subtop->lasts = new_array;
4245       subtop->alasts = new_alasts;
4246     }
4247   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4248   if (BE (new_entry != NULL, 1))
4249     {
4250       subtop->lasts[subtop->nlasts] = new_entry;
4251       new_entry->node = node;
4252       new_entry->str_idx = str_idx;
4253       ++subtop->nlasts;
4254     }
4255   return new_entry;
4256 }
4257
4258 static void
4259 internal_function
4260 sift_ctx_init (re_sift_context_t *sctx,
4261                re_dfastate_t **sifted_sts,
4262                re_dfastate_t **limited_sts,
4263                int last_node, int last_str_idx)
4264 {
4265   sctx->sifted_states = sifted_sts;
4266   sctx->limited_states = limited_sts;
4267   sctx->last_node = last_node;
4268   sctx->last_str_idx = last_str_idx;
4269   re_node_set_init_empty (&sctx->limits);
4270 }