lib/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      int n) internal_function;
  22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  23 static void match_ctx_free (re_match_context_t *cache) internal_function;
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  25                                           int str_idx, int from, int to)
  26      internal_function;
  27 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  28      internal_function;
  29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  30                                            int str_idx) internal_function;
  31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  32                                                    int node, int str_idx)
  33      internal_function;
  34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  35                            re_dfastate_t **limited_sts, int last_node,
  36                            int last_str_idx)
  37      internal_function;
  38 static reg_errcode_t re_search_internal (const regex_t *preg,
  39                                          const char *string, int length,
  40                                          int start, int range, int stop,
  41                                          size_t nmatch, regmatch_t pmatch[],
  42                                          int eflags) internal_function;
  43 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  44                              const char *string1, int length1,
  45                              const char *string2, int length2,
  46                              int start, int range, struct re_registers *regs,
  47                              int stop, int ret_len) internal_function;
  48 static int re_search_stub (struct re_pattern_buffer *bufp,
  49                            const char *string, int length, int start,
  50                            int range, int stop, struct re_registers *regs,
  51                            int ret_len) internal_function;
  52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  53                               int nregs, int regs_allocated) internal_function;
  54 static inline re_dfastate_t *acquire_init_state_context
  55      (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
  56      __attribute ((always_inline)) internal_function;
  57 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  58      internal_function;
  59 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  60                            int *p_match_first)
  61      internal_function;
  62 static int check_halt_node_context (const re_dfa_t *dfa, int node,
  63                                     unsigned int context) internal_function;
  64 static int check_halt_state_context (const re_match_context_t *mctx,
  65                                      const re_dfastate_t *state, int idx)
  66      internal_function;
  67 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  68                          regmatch_t *prev_idx_match, int cur_node,
  69                          int cur_idx, int nmatch) internal_function;
  70 static int proceed_next_node (const re_match_context_t *mctx,
  71                               int nregs, regmatch_t *regs,
  72                               int *pidx, int node, re_node_set *eps_via_nodes,
  73                               struct re_fail_stack_t *fs) internal_function;
  74 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  75                                       int str_idx, int dest_node, int nregs,
  76                                       regmatch_t *regs,
  77                                       re_node_set *eps_via_nodes) internal_function;
  78 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
  79                            regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
  80 static reg_errcode_t set_regs (const regex_t *preg,
  81                                const re_match_context_t *mctx,
  82                                size_t nmatch, regmatch_t *pmatch,
  83                                int fl_backtrack) internal_function;
  84 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  85
  86 #ifdef RE_ENABLE_I18N
  87 static int sift_states_iter_mb (const re_match_context_t *mctx,
  88                                 re_sift_context_t *sctx,
  89                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  90 #endif /* RE_ENABLE_I18N */
  91 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  92                                            re_sift_context_t *sctx) internal_function;
  93 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  94                                           re_sift_context_t *sctx, int str_idx,
  95                                           re_node_set *cur_dest) internal_function;
  96 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  97                                               re_sift_context_t *sctx,
  98                                               int str_idx,
  99                                               re_node_set *dest_nodes) internal_function;
 100 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
 101                                             re_node_set *dest_nodes,
 102                                             const re_node_set *candidates) internal_function;
 103 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
 104                                             re_node_set *dest_nodes,
 105                                             const re_node_set *and_nodes) internal_function;
 106 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
 107                              int dst_node, int dst_idx, int src_node,
 108                              int src_idx) internal_function;
 109 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
 110                                         int boundaries, int subexp_idx,
 111                                         int from_node, int bkref_idx) internal_function;
 112 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
 113                                       int limit, int subexp_idx,
 114                                       int node, int str_idx,
 115                                       int bkref_idx) internal_function;
 116 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 117                                           re_node_set *dest_nodes,
 118                                           const re_node_set *candidates,
 119                                           re_node_set *limits,
 120                                           struct re_backref_cache_entry *bkref_ents,
 121                                           int str_idx) internal_function;
 122 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 123                                         re_sift_context_t *sctx,
 124                                         int str_idx, const re_node_set *candidates) internal_function;
 125 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
 126                                                 int next_state_log_idx) internal_function;
 127 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 128                                         re_dfastate_t **src, int num) internal_function;
 129 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 130                                          re_match_context_t *mctx) internal_function;
 131 static re_dfastate_t *transit_state (reg_errcode_t *err,
 132                                      re_match_context_t *mctx,
 133                                      re_dfastate_t *state) internal_function;
 134 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 135                                             re_match_context_t *mctx,
 136                                             re_dfastate_t *next_state) internal_function;
 137 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 138                                                 re_node_set *cur_nodes,
 139                                                 int str_idx) internal_function;
 140 #if 0
 141 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 142                                         re_match_context_t *mctx,
 143                                         re_dfastate_t *pstate) internal_function;
 144 #endif
 145 #ifdef RE_ENABLE_I18N
 146 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 147                                        re_dfastate_t *pstate) internal_function;
 148 #endif /* RE_ENABLE_I18N */
 149 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 150                                           const re_node_set *nodes) internal_function;
 151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 152                                  int bkref_node, int bkref_str_idx) internal_function;
 153 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 154                                      const re_sub_match_top_t *sub_top,
 155                                      re_sub_match_last_t *sub_last,
 156                                      int bkref_node, int bkref_str) internal_function;
 157 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 158                              int subexp_idx, int type) internal_function;
 159 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 160                                     state_array_t *path, int top_node,
 161                                     int top_str, int last_node, int last_str,
 162                                     int type) internal_function;
 163 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 164                                                    int str_idx,
 165                                                    re_node_set *cur_nodes,
 166                                                    re_node_set *next_nodes) internal_function;
 167 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 168                                                re_node_set *cur_nodes,
 169                                                int ex_subexp, int type) internal_function;
 170 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 171                                                    re_node_set *dst_nodes,
 172                                                    int target, int ex_subexp,
 173                                                    int type) internal_function;
 174 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 175                                          re_node_set *cur_nodes, int cur_str,
 176                                          int subexp_num, int type) internal_function;
 177 static int build_trtable (re_dfa_t *dfa,
 178                           re_dfastate_t *state) internal_function;
 179 #ifdef RE_ENABLE_I18N
 180 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 181                                     const re_string_t *input, int idx) internal_function;
 182 # ifdef _LIBC
 183 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 184                                                    size_t name_len) internal_function;
 185 # endif /* _LIBC */
 186 #endif /* RE_ENABLE_I18N */
 187 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 188                                        const re_dfastate_t *state,
 189                                        re_node_set *states_node,
 190                                        bitset *states_ch) internal_function;
 191 static int check_node_accept (const re_match_context_t *mctx,
 192                               const re_token_t *node, int idx) internal_function;
 193 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 194 \f
 195 /* Entry point for POSIX code.  */
 196
 197 /* regexec searches for a given pattern, specified by PREG, in the
 198    string STRING.
 199
 200    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 201    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 202    least NMATCH elements, and we set them to the offsets of the
 203    corresponding matched substrings.
 204
 205    EFLAGS specifies `execution flags' which affect matching: if
 206    REG_NOTBOL is set, then ^ does not match at the beginning of the
 207    string; if REG_NOTEOL is set, then $ does not match at the end.
 208
 209    We return 0 if we find a match and REG_NOMATCH if not.  */
 210
 211 int
 212 regexec (preg, string, nmatch, pmatch, eflags)
 213     const regex_t *__restrict preg;
 214     const char *__restrict string;
 215     size_t nmatch;
 216     regmatch_t pmatch[];
 217     int eflags;
 218 {
 219   reg_errcode_t err;
 220   int start, length;
 221   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 222
 223   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 224     return REG_BADPAT;
 225
 226   if (eflags & REG_STARTEND)
 227     {
 228       start = pmatch[0].rm_so;
 229       length = pmatch[0].rm_eo;
 230     }
 231   else
 232     {
 233       start = 0;
 234       length = strlen (string);
 235     }
 236
 237   __libc_lock_lock (dfa->lock);
 238   if (preg->no_sub)
 239     err = re_search_internal (preg, string, length, start, length - start,
 240                               length, 0, NULL, eflags);
 241   else
 242     err = re_search_internal (preg, string, length, start, length - start,
 243                               length, nmatch, pmatch, eflags);
 244   __libc_lock_unlock (dfa->lock);
 245   return err != REG_NOERROR;
 246 }
 247
 248 #ifdef _LIBC
 249 # include <shlib-compat.h>
 250 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 251
 252 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 253 __typeof__ (__regexec) __compat_regexec;
 254
 255 int
 256 attribute_compat_text_section
 257 __compat_regexec (const regex_t *__restrict preg,
 258                   const char *__restrict string, size_t nmatch,
 259                   regmatch_t pmatch[], int eflags)
 260 {
 261   return regexec (preg, string, nmatch, pmatch,
 262                   eflags & (REG_NOTBOL | REG_NOTEOL));
 263 }
 264 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 265 # endif
 266 #endif
 267
 268 /* Entry points for GNU code.  */
 269
 270 /* re_match, re_search, re_match_2, re_search_2
 271
 272    The former two functions operate on STRING with length LENGTH,
 273    while the later two operate on concatenation of STRING1 and STRING2
 274    with lengths LENGTH1 and LENGTH2, respectively.
 275
 276    re_match() matches the compiled pattern in BUFP against the string,
 277    starting at index START.
 278
 279    re_search() first tries matching at index START, then it tries to match
 280    starting from index START + 1, and so on.  The last start position tried
 281    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 282    way as re_match().)
 283
 284    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 285    the first STOP characters of the concatenation of the strings should be
 286    concerned.
 287
 288    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 289    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 290    computed relative to the concatenation, not relative to the individual
 291    strings.)
 292
 293    On success, re_match* functions return the length of the match, re_search*
 294    return the position of the start of the match.  Return value -1 means no
 295    match was found and -2 indicates an internal error.  */
 296
 297 int
 298 re_match (bufp, string, length, start, regs)
 299     struct re_pattern_buffer *bufp;
 300     const char *string;
 301     int length, start;
 302     struct re_registers *regs;
 303 {
 304   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 305 }
 306 #ifdef _LIBC
 307 weak_alias (__re_match, re_match)
 308 #endif
 309
 310 int
 311 re_search (bufp, string, length, start, range, regs)
 312     struct re_pattern_buffer *bufp;
 313     const char *string;
 314     int length, start, range;
 315     struct re_registers *regs;
 316 {
 317   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 318 }
 319 #ifdef _LIBC
 320 weak_alias (__re_search, re_search)
 321 #endif
 322
 323 int
 324 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 325     struct re_pattern_buffer *bufp;
 326     const char *string1, *string2;
 327     int length1, length2, start, stop;
 328     struct re_registers *regs;
 329 {
 330   return re_search_2_stub (bufp, string1, length1, string2, length2,
 331                            start, 0, regs, stop, 1);
 332 }
 333 #ifdef _LIBC
 334 weak_alias (__re_match_2, re_match_2)
 335 #endif
 336
 337 int
 338 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 339     struct re_pattern_buffer *bufp;
 340     const char *string1, *string2;
 341     int length1, length2, start, range, stop;
 342     struct re_registers *regs;
 343 {
 344   return re_search_2_stub (bufp, string1, length1, string2, length2,
 345                            start, range, regs, stop, 0);
 346 }
 347 #ifdef _LIBC
 348 weak_alias (__re_search_2, re_search_2)
 349 #endif
 350
 351 static int
 352 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 353                   stop, ret_len)
 354     struct re_pattern_buffer *bufp;
 355     const char *string1, *string2;
 356     int length1, length2, start, range, stop, ret_len;
 357     struct re_registers *regs;
 358 {
 359   const char *str;
 360   int rval;
 361   int len = length1 + length2;
 362   int free_str = 0;
 363
 364   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 365     return -2;
 366
 367   /* Concatenate the strings.  */
 368   if (length2 > 0)
 369     if (length1 > 0)
 370       {
 371         char *s = re_malloc (char, len);
 372
 373         if (BE (s == NULL, 0))
 374           return -2;
 375         memcpy (s, string1, length1);
 376         memcpy (s + length1, string2, length2);
 377         str = s;
 378         free_str = 1;
 379       }
 380     else
 381       str = string2;
 382   else
 383     str = string1;
 384
 385   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 386                          ret_len);
 387   if (free_str)
 388     re_free ((char *) str);
 389   return rval;
 390 }
 391
 392 /* The parameters have the same meaning as those of re_search.
 393    Additional parameters:
 394    If RET_LEN is nonzero the length of the match is returned (re_match style);
 395    otherwise the position of the match is returned.  */
 396
 397 static int
 398 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 399     struct re_pattern_buffer *bufp;
 400     const char *string;
 401     int length, start, range, stop, ret_len;
 402     struct re_registers *regs;
 403 {
 404   reg_errcode_t result;
 405   regmatch_t *pmatch;
 406   int nregs, rval;
 407   int eflags = 0;
 408   re_dfa_t *dfa = (re_dfa_t *)bufp->buffer;
 409
 410   /* Check for out-of-range.  */
 411   if (BE (start < 0 || start > length, 0))
 412     return -1;
 413   if (BE (start + range > length, 0))
 414     range = length - start;
 415   else if (BE (start + range < 0, 0))
 416     range = -start;
 417
 418   __libc_lock_lock (dfa->lock);
 419
 420   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 421   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 422
 423   /* Compile fastmap if we haven't yet.  */
 424   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 425     re_compile_fastmap (bufp);
 426
 427   if (BE (bufp->no_sub, 0))
 428     regs = NULL;
 429
 430   /* We need at least 1 register.  */
 431   if (regs == NULL)
 432     nregs = 1;
 433   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 434                regs->num_regs < bufp->re_nsub + 1, 0))
 435     {
 436       nregs = regs->num_regs;
 437       if (BE (nregs < 1, 0))
 438         {
 439           /* Nothing can be copied to regs.  */
 440           regs = NULL;
 441           nregs = 1;
 442         }
 443     }
 444   else
 445     nregs = bufp->re_nsub + 1;
 446   pmatch = re_malloc (regmatch_t, nregs);
 447   if (BE (pmatch == NULL, 0))
 448     {
 449       rval = -2;
 450       goto out;
 451     }
 452
 453   result = re_search_internal (bufp, string, length, start, range, stop,
 454                                nregs, pmatch, eflags);
 455
 456   rval = 0;
 457
 458   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 459   if (result != REG_NOERROR)
 460     rval = -1;
 461   else if (regs != NULL)
 462     {
 463       /* If caller wants register contents data back, copy them.  */
 464       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 465                                            bufp->regs_allocated);
 466       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 467         rval = -2;
 468     }
 469
 470   if (BE (rval == 0, 1))
 471     {
 472       if (ret_len)
 473         {
 474           assert (pmatch[0].rm_so == start);
 475           rval = pmatch[0].rm_eo - start;
 476         }
 477       else
 478         rval = pmatch[0].rm_so;
 479     }
 480   re_free (pmatch);
 481  out:
 482   __libc_lock_unlock (dfa->lock);
 483   return rval;
 484 }
 485
 486 static unsigned
 487 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 488     struct re_registers *regs;
 489     regmatch_t *pmatch;
 490     int nregs, regs_allocated;
 491 {
 492   int rval = REGS_REALLOCATE;
 493   int i;
 494   int need_regs = nregs + 1;
 495   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 496      uses.  */
 497
 498   /* Have the register data arrays been allocated?  */
 499   if (regs_allocated == REGS_UNALLOCATED)
 500     { /* No.  So allocate them with malloc.  */
 501       regs->start = re_malloc (regoff_t, need_regs);
 502       regs->end = re_malloc (regoff_t, need_regs);
 503       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 504         return REGS_UNALLOCATED;
 505       regs->num_regs = need_regs;
 506     }
 507   else if (regs_allocated == REGS_REALLOCATE)
 508     { /* Yes.  If we need more elements than were already
 509          allocated, reallocate them.  If we need fewer, just
 510          leave it alone.  */
 511       if (BE (need_regs > regs->num_regs, 0))
 512         {
 513           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 514           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 515           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 516             return REGS_UNALLOCATED;
 517           regs->start = new_start;
 518           regs->end = new_end;
 519           regs->num_regs = need_regs;
 520         }
 521     }
 522   else
 523     {
 524       assert (regs_allocated == REGS_FIXED);
 525       /* This function may not be called with REGS_FIXED and nregs too big.  */
 526       assert (regs->num_regs >= nregs);
 527       rval = REGS_FIXED;
 528     }
 529
 530   /* Copy the regs.  */
 531   for (i = 0; i < nregs; ++i)
 532     {
 533       regs->start[i] = pmatch[i].rm_so;
 534       regs->end[i] = pmatch[i].rm_eo;
 535     }
 536   for ( ; i < regs->num_regs; ++i)
 537     regs->start[i] = regs->end[i] = -1;
 538
 539   return rval;
 540 }
 541
 542 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 543    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 544    this memory for recording register information.  STARTS and ENDS
 545    must be allocated using the malloc library routine, and must each
 546    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 547
 548    If NUM_REGS == 0, then subsequent matches should allocate their own
 549    register data.
 550
 551    Unless this function is called, the first search or match using
 552    PATTERN_BUFFER will allocate its own register data, without
 553    freeing the old data.  */
 554
 555 void
 556 re_set_registers (bufp, regs, num_regs, starts, ends)
 557     struct re_pattern_buffer *bufp;
 558     struct re_registers *regs;
 559     unsigned num_regs;
 560     regoff_t *starts, *ends;
 561 {
 562   if (num_regs)
 563     {
 564       bufp->regs_allocated = REGS_REALLOCATE;
 565       regs->num_regs = num_regs;
 566       regs->start = starts;
 567       regs->end = ends;
 568     }
 569   else
 570     {
 571       bufp->regs_allocated = REGS_UNALLOCATED;
 572       regs->num_regs = 0;
 573       regs->start = regs->end = (regoff_t *) 0;
 574     }
 575 }
 576 #ifdef _LIBC
 577 weak_alias (__re_set_registers, re_set_registers)
 578 #endif
 579 \f
 580 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 581    them unless specifically requested.  */
 582
 583 #if defined _REGEX_RE_COMP || defined _LIBC
 584 int
 585 # ifdef _LIBC
 586 weak_function
 587 # endif
 588 re_exec (s)
 589      const char *s;
 590 {
 591   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 592 }
 593 #endif /* _REGEX_RE_COMP */
 594 \f
 595 /* Internal entry point.  */
 596
 597 /* Searches for a compiled pattern PREG in the string STRING, whose
 598    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 599    mingings with regexec.  START, and RANGE have the same meanings
 600    with re_search.
 601    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 602    otherwise return the error code.
 603    Note: We assume front end functions already check ranges.
 604    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 605
 606 static reg_errcode_t
 607 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 608                     eflags)
 609     const regex_t *preg;
 610     const char *string;
 611     int length, start, range, stop, eflags;
 612     size_t nmatch;
 613     regmatch_t pmatch[];
 614 {
 615   reg_errcode_t err;
 616   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 617   int left_lim, right_lim, incr;
 618   int fl_longest_match, match_first, match_kind, match_last = -1;
 619   int extra_nmatch;
 620   int sb, ch;
 621 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 622   re_match_context_t mctx = { .dfa = dfa };
 623 #else
 624   re_match_context_t mctx;
 625 #endif
 626   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 627                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 628   unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
 629
 630 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 631   memset (&mctx, '\0', sizeof (re_match_context_t));
 632   mctx.dfa = dfa;
 633 #endif
 634
 635   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 636   nmatch -= extra_nmatch;
 637
 638   /* Check if the DFA haven't been compiled.  */
 639   if (BE (preg->used == 0 || dfa->init_state == NULL
 640           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 641           || dfa->init_state_begbuf == NULL, 0))
 642     return REG_NOMATCH;
 643
 644 #ifdef DEBUG
 645   /* We assume front-end functions already check them.  */
 646   assert (start + range >= 0 && start + range <= length);
 647 #endif
 648
 649   /* If initial states with non-begbuf contexts have no elements,
 650      the regex must be anchored.  If preg->newline_anchor is set,
 651      we'll never use init_state_nl, so do not check it.  */
 652   if (dfa->init_state->nodes.nelem == 0
 653       && dfa->init_state_word->nodes.nelem == 0
 654       && (dfa->init_state_nl->nodes.nelem == 0
 655           || !preg->newline_anchor))
 656     {
 657       if (start != 0 && start + range != 0)
 658         return REG_NOMATCH;
 659       start = range = 0;
 660     }
 661
 662   /* We must check the longest matching, if nmatch > 0.  */
 663   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 664
 665   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 666                             preg->translate, preg->syntax & RE_ICASE, dfa);
 667   if (BE (err != REG_NOERROR, 0))
 668     goto free_return;
 669   mctx.input.stop = stop;
 670   mctx.input.raw_stop = stop;
 671   mctx.input.newline_anchor = preg->newline_anchor;
 672
 673   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 674   if (BE (err != REG_NOERROR, 0))
 675     goto free_return;
 676
 677   /* We will log all the DFA states through which the dfa pass,
 678      if nmatch > 1, or this dfa has "multibyte node", which is a
 679      back-reference or a node which can accept multibyte character or
 680      multi character collating element.  */
 681   if (nmatch > 1 || dfa->has_mb_node)
 682     {
 683       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 684       if (BE (mctx.state_log == NULL, 0))
 685         {
 686           err = REG_ESPACE;
 687           goto free_return;
 688         }
 689     }
 690   else
 691     mctx.state_log = NULL;
 692
 693   match_first = start;
 694   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 695                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 696
 697   /* Check incrementally whether of not the input string match.  */
 698   incr = (range < 0) ? -1 : 1;
 699   left_lim = (range < 0) ? start + range : start;
 700   right_lim = (range < 0) ? start : start + range;
 701   sb = dfa->mb_cur_max == 1;
 702   match_kind =
 703     (fastmap
 704      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 705         | (range >= 0 ? 2 : 0)
 706         | (t != NULL ? 1 : 0))
 707      : 8);
 708
 709   for (;; match_first += incr)
 710     {
 711       err = REG_NOMATCH;
 712       if (match_first < left_lim || right_lim < match_first)
 713         goto free_return;
 714
 715       /* Advance as rapidly as possible through the string, until we
 716          find a plausible place to start matching.  This may be done
 717          with varying efficiency, so there are various possibilities:
 718          only the most common of them are specialized, in order to
 719          save on code size.  We use a switch statement for speed.  */
 720       switch (match_kind)
 721         {
 722         case 8:
 723           /* No fastmap.  */
 724           break;
 725
 726         case 7:
 727           /* Fastmap with single-byte translation, match forward.  */
 728           while (BE (match_first < right_lim, 1)
 729                  && !fastmap[t[(unsigned char) string[match_first]]])
 730             ++match_first;
 731           goto forward_match_found_start_or_reached_end;
 732
 733         case 6:
 734           /* Fastmap without translation, match forward.  */
 735           while (BE (match_first < right_lim, 1)
 736                  && !fastmap[(unsigned char) string[match_first]])
 737             ++match_first;
 738
 739         forward_match_found_start_or_reached_end:
 740           if (BE (match_first == right_lim, 0))
 741             {
 742               ch = match_first >= length
 743                        ? 0 : (unsigned char) string[match_first];
 744               if (!fastmap[t ? t[ch] : ch])
 745                 goto free_return;
 746             }
 747           break;
 748
 749         case 4:
 750         case 5:
 751           /* Fastmap without multi-byte translation, match backwards.  */
 752           while (match_first >= left_lim)
 753             {
 754               ch = match_first >= length
 755                        ? 0 : (unsigned char) string[match_first];
 756               if (fastmap[t ? t[ch] : ch])
 757                 break;
 758               --match_first;
 759             }
 760           if (match_first < left_lim)
 761             goto free_return;
 762           break;
 763
 764         default:
 765           /* In this case, we can't determine easily the current byte,
 766              since it might be a component byte of a multibyte
 767              character.  Then we use the constructed buffer instead.  */
 768           for (;;)
 769             {
 770               /* If MATCH_FIRST is out of the valid range, reconstruct the
 771                  buffers.  */
 772               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 773               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 774                 {
 775                   err = re_string_reconstruct (&mctx.input, match_first,
 776                                                eflags);
 777                   if (BE (err != REG_NOERROR, 0))
 778                     goto free_return;
 779
 780                   offset = match_first - mctx.input.raw_mbs_idx;
 781                 }
 782               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 783                  Note that MATCH_FIRST must not be smaller than 0.  */
 784               ch = (match_first >= length
 785                     ? 0 : re_string_byte_at (&mctx.input, offset));
 786               if (fastmap[ch])
 787                 break;
 788               match_first += incr;
 789               if (match_first < left_lim || match_first > right_lim)
 790                 {
 791                   err = REG_NOMATCH;
 792                   goto free_return;
 793                 }
 794             }
 795           break;
 796         }
 797
 798       /* Reconstruct the buffers so that the matcher can assume that
 799          the matching starts from the beginning of the buffer.  */
 800       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 801       if (BE (err != REG_NOERROR, 0))
 802         goto free_return;
 803
 804 #ifdef RE_ENABLE_I18N
 805      /* Don't consider this char as a possible match start if it part,
 806         yet isn't the head, of a multibyte character.  */
 807       if (!sb && !re_string_first_byte (&mctx.input, 0))
 808         continue;
 809 #endif
 810
 811       /* It seems to be appropriate one, then use the matcher.  */
 812       /* We assume that the matching starts from 0.  */
 813       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 814       match_last = check_matching (&mctx, fl_longest_match,
 815                                    range >= 0 ? &match_first : NULL);
 816       if (match_last != -1)
 817         {
 818           if (BE (match_last == -2, 0))
 819             {
 820               err = REG_ESPACE;
 821               goto free_return;
 822             }
 823           else
 824             {
 825               mctx.match_last = match_last;
 826               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 827                 {
 828                   re_dfastate_t *pstate = mctx.state_log[match_last];
 829                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 830                                                              match_last);
 831                 }
 832               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 833                   || dfa->nbackref)
 834                 {
 835                   err = prune_impossible_nodes (&mctx);
 836                   if (err == REG_NOERROR)
 837                     break;
 838                   if (BE (err != REG_NOMATCH, 0))
 839                     goto free_return;
 840                   match_last = -1;
 841                 }
 842               else
 843                 break; /* We found a match.  */
 844             }
 845         }
 846
 847       match_ctx_clean (&mctx);
 848     }
 849
 850 #ifdef DEBUG
 851   assert (match_last != -1);
 852   assert (err == REG_NOERROR);
 853 #endif
 854
 855   /* Set pmatch[] if we need.  */
 856   if (nmatch > 0)
 857     {
 858       int reg_idx;
 859
 860       /* Initialize registers.  */
 861       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 862         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 863
 864       /* Set the points where matching start/end.  */
 865       pmatch[0].rm_so = 0;
 866       pmatch[0].rm_eo = mctx.match_last;
 867
 868       if (!preg->no_sub && nmatch > 1)
 869         {
 870           err = set_regs (preg, &mctx, nmatch, pmatch,
 871                           dfa->has_plural_match && dfa->nbackref > 0);
 872           if (BE (err != REG_NOERROR, 0))
 873             goto free_return;
 874         }
 875
 876       /* At last, add the offset to the each registers, since we slided
 877          the buffers so that we could assume that the matching starts
 878          from 0.  */
 879       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 880         if (pmatch[reg_idx].rm_so != -1)
 881           {
 882 #ifdef RE_ENABLE_I18N
 883             if (BE (mctx.input.offsets_needed != 0, 0))
 884               {
 885                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 886                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 887                 else
 888                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 889                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 890                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 891                 else
 892                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 893               }
 894 #else
 895             assert (mctx.input.offsets_needed == 0);
 896 #endif
 897             pmatch[reg_idx].rm_so += match_first;
 898             pmatch[reg_idx].rm_eo += match_first;
 899           }
 900       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 901         {
 902           pmatch[nmatch + reg_idx].rm_so = -1;
 903           pmatch[nmatch + reg_idx].rm_eo = -1;
 904         }
 905
 906       if (dfa->subexp_map)
 907         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 908           if (dfa->subexp_map[reg_idx] != reg_idx)
 909             {
 910               pmatch[reg_idx + 1].rm_so
 911                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 912               pmatch[reg_idx + 1].rm_eo
 913                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 914             }
 915     }
 916
 917  free_return:
 918   re_free (mctx.state_log);
 919   if (dfa->nbackref)
 920     match_ctx_free (&mctx);
 921   re_string_destruct (&mctx.input);
 922   return err;
 923 }
 924
 925 static reg_errcode_t
 926 prune_impossible_nodes (mctx)
 927      re_match_context_t *mctx;
 928 {
 929   re_dfa_t *const dfa = mctx->dfa;
 930   int halt_node, match_last;
 931   reg_errcode_t ret;
 932   re_dfastate_t **sifted_states;
 933   re_dfastate_t **lim_states = NULL;
 934   re_sift_context_t sctx;
 935 #ifdef DEBUG
 936   assert (mctx->state_log != NULL);
 937 #endif
 938   match_last = mctx->match_last;
 939   halt_node = mctx->last_node;
 940   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 941   if (BE (sifted_states == NULL, 0))
 942     {
 943       ret = REG_ESPACE;
 944       goto free_return;
 945     }
 946   if (dfa->nbackref)
 947     {
 948       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 949       if (BE (lim_states == NULL, 0))
 950         {
 951           ret = REG_ESPACE;
 952           goto free_return;
 953         }
 954       while (1)
 955         {
 956           memset (lim_states, '\0',
 957                   sizeof (re_dfastate_t *) * (match_last + 1));
 958           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 959                          match_last);
 960           ret = sift_states_backward (mctx, &sctx);
 961           re_node_set_free (&sctx.limits);
 962           if (BE (ret != REG_NOERROR, 0))
 963               goto free_return;
 964           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 965             break;
 966           do
 967             {
 968               --match_last;
 969               if (match_last < 0)
 970                 {
 971                   ret = REG_NOMATCH;
 972                   goto free_return;
 973                 }
 974             } while (mctx->state_log[match_last] == NULL
 975                      || !mctx->state_log[match_last]->halt);
 976           halt_node = check_halt_state_context (mctx,
 977                                                 mctx->state_log[match_last],
 978                                                 match_last);
 979         }
 980       ret = merge_state_array (dfa, sifted_states, lim_states,
 981                                match_last + 1);
 982       re_free (lim_states);
 983       lim_states = NULL;
 984       if (BE (ret != REG_NOERROR, 0))
 985         goto free_return;
 986     }
 987   else
 988     {
 989       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 990       ret = sift_states_backward (mctx, &sctx);
 991       re_node_set_free (&sctx.limits);
 992       if (BE (ret != REG_NOERROR, 0))
 993         goto free_return;
 994     }
 995   re_free (mctx->state_log);
 996   mctx->state_log = sifted_states;
 997   sifted_states = NULL;
 998   mctx->last_node = halt_node;
 999   mctx->match_last = match_last;
1000   ret = REG_NOERROR;
1001  free_return:
1002   re_free (sifted_states);
1003   re_free (lim_states);
1004   return ret;
1005 }
1006
1007 /* Acquire an initial state and return it.
1008    We must select appropriate initial state depending on the context,
1009    since initial states may have constraints like "\<", "^", etc..  */
1010
1011 static inline re_dfastate_t *
1012 acquire_init_state_context (err, mctx, idx)
1013      reg_errcode_t *err;
1014      const re_match_context_t *mctx;
1015      int idx;
1016 {
1017   re_dfa_t *const dfa = mctx->dfa;
1018   if (dfa->init_state->has_constraint)
1019     {
1020       unsigned int context;
1021       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1022       if (IS_WORD_CONTEXT (context))
1023         return dfa->init_state_word;
1024       else if (IS_ORDINARY_CONTEXT (context))
1025         return dfa->init_state;
1026       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1027         return dfa->init_state_begbuf;
1028       else if (IS_NEWLINE_CONTEXT (context))
1029         return dfa->init_state_nl;
1030       else if (IS_BEGBUF_CONTEXT (context))
1031         {
1032           /* It is relatively rare case, then calculate on demand.  */
1033           return re_acquire_state_context (err, dfa,
1034                                            dfa->init_state->entrance_nodes,
1035                                            context);
1036         }
1037       else
1038         /* Must not happen?  */
1039         return dfa->init_state;
1040     }
1041   else
1042     return dfa->init_state;
1043 }
1044
1045 /* Check whether the regular expression match input string INPUT or not,
1046    and return the index where the matching end, return -1 if not match,
1047    or return -2 in case of an error.
1048    FL_LONGEST_MATCH means we want the POSIX longest matching.
1049    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1050    next place where we may want to try matching.
1051    Note that the matcher assume that the maching starts from the current
1052    index of the buffer.  */
1053
1054 static int
1055 check_matching (mctx, fl_longest_match, p_match_first)
1056     re_match_context_t *mctx;
1057     int fl_longest_match;
1058     int *p_match_first;
1059 {
1060   re_dfa_t *const dfa = mctx->dfa;
1061   reg_errcode_t err;
1062   int match = 0;
1063   int match_last = -1;
1064   int cur_str_idx = re_string_cur_idx (&mctx->input);
1065   re_dfastate_t *cur_state;
1066   int at_init_state = p_match_first != NULL;
1067   int next_start_idx = cur_str_idx;
1068
1069   err = REG_NOERROR;
1070   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1071   /* An initial state must not be NULL (invalid).  */
1072   if (BE (cur_state == NULL, 0))
1073     {
1074       assert (err == REG_ESPACE);
1075       return -2;
1076     }
1077
1078   if (mctx->state_log != NULL)
1079     {
1080       mctx->state_log[cur_str_idx] = cur_state;
1081
1082       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1083          later.  E.g. Processing back references.  */
1084       if (BE (dfa->nbackref, 0))
1085         {
1086           at_init_state = 0;
1087           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1088           if (BE (err != REG_NOERROR, 0))
1089             return err;
1090
1091           if (cur_state->has_backref)
1092             {
1093               err = transit_state_bkref (mctx, &cur_state->nodes);
1094               if (BE (err != REG_NOERROR, 0))
1095                 return err;
1096             }
1097         }
1098     }
1099
1100   /* If the RE accepts NULL string.  */
1101   if (BE (cur_state->halt, 0))
1102     {
1103       if (!cur_state->has_constraint
1104           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1105         {
1106           if (!fl_longest_match)
1107             return cur_str_idx;
1108           else
1109             {
1110               match_last = cur_str_idx;
1111               match = 1;
1112             }
1113         }
1114     }
1115
1116   while (!re_string_eoi (&mctx->input))
1117     {
1118       re_dfastate_t *old_state = cur_state;
1119       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1120
1121       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1122           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1123               && mctx->input.valid_len < mctx->input.len))
1124         {
1125           err = extend_buffers (mctx);
1126           if (BE (err != REG_NOERROR, 0))
1127             {
1128               assert (err == REG_ESPACE);
1129               return -2;
1130             }
1131         }
1132
1133       cur_state = transit_state (&err, mctx, cur_state);
1134       if (mctx->state_log != NULL)
1135         cur_state = merge_state_with_log (&err, mctx, cur_state);
1136
1137       if (cur_state == NULL)
1138         {
1139           /* Reached the invalid state or an error.  Try to recover a valid
1140              state using the state log, if available and if we have not
1141              already found a valid (even if not the longest) match.  */
1142           if (BE (err != REG_NOERROR, 0))
1143             return -2;
1144
1145           if (mctx->state_log == NULL
1146               || (match && !fl_longest_match)
1147               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1148             break;
1149         }
1150
1151       if (BE (at_init_state, 0))
1152         {
1153           if (old_state == cur_state)
1154             next_start_idx = next_char_idx;
1155           else
1156             at_init_state = 0;
1157         }
1158
1159       if (cur_state->halt)
1160         {
1161           /* Reached a halt state.
1162              Check the halt state can satisfy the current context.  */
1163           if (!cur_state->has_constraint
1164               || check_halt_state_context (mctx, cur_state,
1165                                            re_string_cur_idx (&mctx->input)))
1166             {
1167               /* We found an appropriate halt state.  */
1168               match_last = re_string_cur_idx (&mctx->input);
1169               match = 1;
1170
1171               /* We found a match, do not modify match_first below.  */
1172               p_match_first = NULL;
1173               if (!fl_longest_match)
1174                 break;
1175             }
1176         }
1177     }
1178
1179   if (p_match_first)
1180     *p_match_first += next_start_idx;
1181
1182   return match_last;
1183 }
1184
1185 /* Check NODE match the current context.  */
1186
1187 static int check_halt_node_context (dfa, node, context)
1188     const re_dfa_t *dfa;
1189     int node;
1190     unsigned int context;
1191 {
1192   re_token_type_t type = dfa->nodes[node].type;
1193   unsigned int constraint = dfa->nodes[node].constraint;
1194   if (type != END_OF_RE)
1195     return 0;
1196   if (!constraint)
1197     return 1;
1198   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1199     return 0;
1200   return 1;
1201 }
1202
1203 /* Check the halt state STATE match the current context.
1204    Return 0 if not match, if the node, STATE has, is a halt node and
1205    match the context, return the node.  */
1206
1207 static int
1208 check_halt_state_context (mctx, state, idx)
1209     const re_match_context_t *mctx;
1210     const re_dfastate_t *state;
1211     int idx;
1212 {
1213   int i;
1214   unsigned int context;
1215 #ifdef DEBUG
1216   assert (state->halt);
1217 #endif
1218   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1219   for (i = 0; i < state->nodes.nelem; ++i)
1220     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1221       return state->nodes.elems[i];
1222   return 0;
1223 }
1224
1225 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1226    corresponding to the DFA).
1227    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1228    of errors.  */
1229
1230 static int
1231 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1232     const re_match_context_t *mctx;
1233     regmatch_t *regs;
1234     int nregs, *pidx, node;
1235     re_node_set *eps_via_nodes;
1236     struct re_fail_stack_t *fs;
1237 {
1238   re_dfa_t *const dfa = mctx->dfa;
1239   int i, err, dest_node;
1240   dest_node = -1;
1241   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1242     {
1243       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1244       re_node_set *edests = &dfa->edests[node];
1245       int dest_node;
1246       err = re_node_set_insert (eps_via_nodes, node);
1247       if (BE (err < 0, 0))
1248         return -2;
1249       /* Pick up a valid destination, or return -1 if none is found.  */
1250       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1251         {
1252           int candidate = edests->elems[i];
1253           if (!re_node_set_contains (cur_nodes, candidate))
1254             continue;
1255           if (dest_node == -1)
1256             dest_node = candidate;
1257
1258           else
1259             {
1260               /* In order to avoid infinite loop like "(a*)*", return the second
1261                  epsilon-transition if the first was already considered.  */
1262               if (re_node_set_contains (eps_via_nodes, dest_node))
1263                 return candidate;
1264
1265               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1266               else if (fs != NULL
1267                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1268                                            eps_via_nodes))
1269                 return -2;
1270
1271               /* We know we are going to exit.  */
1272               break;
1273             }
1274         }
1275       return dest_node;
1276     }
1277   else
1278     {
1279       int naccepted = 0;
1280       re_token_type_t type = dfa->nodes[node].type;
1281
1282 #ifdef RE_ENABLE_I18N
1283       if (dfa->nodes[node].accept_mb)
1284         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1285       else
1286 #endif /* RE_ENABLE_I18N */
1287       if (type == OP_BACK_REF)
1288         {
1289           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1290           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1291           if (fs != NULL)
1292             {
1293               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1294                 return -1;
1295               else if (naccepted)
1296                 {
1297                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1298                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1299                               naccepted) != 0)
1300                     return -1;
1301                 }
1302             }
1303
1304           if (naccepted == 0)
1305             {
1306               err = re_node_set_insert (eps_via_nodes, node);
1307               if (BE (err < 0, 0))
1308                 return -2;
1309               dest_node = dfa->edests[node].elems[0];
1310               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1311                                         dest_node))
1312                 return dest_node;
1313             }
1314         }
1315
1316       if (naccepted != 0
1317           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1318         {
1319           dest_node = dfa->nexts[node];
1320           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1321           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1322                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1323                                                dest_node)))
1324             return -1;
1325           re_node_set_empty (eps_via_nodes);
1326           return dest_node;
1327         }
1328     }
1329   return -1;
1330 }
1331
1332 static reg_errcode_t
1333 push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes)
1334      struct re_fail_stack_t *fs;
1335      int str_idx, dest_node, nregs;
1336      regmatch_t *regs;
1337      re_node_set *eps_via_nodes;
1338 {
1339   reg_errcode_t err;
1340   int num = fs->num++;
1341   if (fs->num == fs->alloc)
1342     {
1343       struct re_fail_stack_ent_t *new_array;
1344       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1345                                        * fs->alloc * 2));
1346       if (new_array == NULL)
1347         return REG_ESPACE;
1348       fs->alloc *= 2;
1349       fs->stack = new_array;
1350     }
1351   fs->stack[num].idx = str_idx;
1352   fs->stack[num].node = dest_node;
1353   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1354   if (fs->stack[num].regs == NULL)
1355     return REG_ESPACE;
1356   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1357   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1358   return err;
1359 }
1360
1361 static int
1362 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1363      struct re_fail_stack_t *fs;
1364      int *pidx, nregs;
1365      regmatch_t *regs;
1366      re_node_set *eps_via_nodes;
1367 {
1368   int num = --fs->num;
1369   assert (num >= 0);
1370   *pidx = fs->stack[num].idx;
1371   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1372   re_node_set_free (eps_via_nodes);
1373   re_free (fs->stack[num].regs);
1374   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1375   return fs->stack[num].node;
1376 }
1377
1378 /* Set the positions where the subexpressions are starts/ends to registers
1379    PMATCH.
1380    Note: We assume that pmatch[0] is already set, and
1381    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1382
1383 static reg_errcode_t
1384 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1385      const regex_t *preg;
1386      const re_match_context_t *mctx;
1387      size_t nmatch;
1388      regmatch_t *pmatch;
1389      int fl_backtrack;
1390 {
1391   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1392   int idx, cur_node;
1393   re_node_set eps_via_nodes;
1394   struct re_fail_stack_t *fs;
1395   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1396   regmatch_t *prev_idx_match;
1397
1398 #ifdef DEBUG
1399   assert (nmatch > 1);
1400   assert (mctx->state_log != NULL);
1401 #endif
1402   if (fl_backtrack)
1403     {
1404       fs = &fs_body;
1405       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1406       if (fs->stack == NULL)
1407         return REG_ESPACE;
1408     }
1409   else
1410     fs = NULL;
1411
1412   cur_node = dfa->init_node;
1413   re_node_set_init_empty (&eps_via_nodes);
1414
1415   prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * nmatch);
1416   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1417
1418   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1419     {
1420       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1421
1422       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1423         {
1424           int reg_idx;
1425           if (fs)
1426             {
1427               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1428                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1429                   break;
1430               if (reg_idx == nmatch)
1431                 {
1432                   re_node_set_free (&eps_via_nodes);
1433                   return free_fail_stack_return (fs);
1434                 }
1435               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1436                                          &eps_via_nodes);
1437             }
1438           else
1439             {
1440               re_node_set_free (&eps_via_nodes);
1441               return REG_NOERROR;
1442             }
1443         }
1444
1445       /* Proceed to next node.  */
1446       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1447                                     &eps_via_nodes, fs);
1448
1449       if (BE (cur_node < 0, 0))
1450         {
1451           if (BE (cur_node == -2, 0))
1452             {
1453               re_node_set_free (&eps_via_nodes);
1454               free_fail_stack_return (fs);
1455               return REG_ESPACE;
1456             }
1457           if (fs)
1458             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1459                                        &eps_via_nodes);
1460           else
1461             {
1462               re_node_set_free (&eps_via_nodes);
1463               return REG_NOMATCH;
1464             }
1465         }
1466     }
1467   re_node_set_free (&eps_via_nodes);
1468   return free_fail_stack_return (fs);
1469 }
1470
1471 static reg_errcode_t
1472 free_fail_stack_return (fs)
1473      struct re_fail_stack_t *fs;
1474 {
1475   if (fs)
1476     {
1477       int fs_idx;
1478       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1479         {
1480           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1481           re_free (fs->stack[fs_idx].regs);
1482         }
1483       re_free (fs->stack);
1484     }
1485   return REG_NOERROR;
1486 }
1487
1488 static void
1489 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1490      re_dfa_t *dfa;
1491      regmatch_t *pmatch, *prev_idx_match;
1492      int cur_node, cur_idx, nmatch;
1493 {
1494   int type = dfa->nodes[cur_node].type;
1495   if (type == OP_OPEN_SUBEXP)
1496     {
1497       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1498
1499       /* We are at the first node of this sub expression.  */
1500       if (reg_num < nmatch)
1501         {
1502           pmatch[reg_num].rm_so = cur_idx;
1503           pmatch[reg_num].rm_eo = -1;
1504         }
1505     }
1506   else if (type == OP_CLOSE_SUBEXP)
1507     {
1508       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1509       if (reg_num < nmatch)
1510         {
1511           /* We are at the last node of this sub expression.  */
1512           if (pmatch[reg_num].rm_so < cur_idx)
1513             {
1514               pmatch[reg_num].rm_eo = cur_idx;
1515               /* This is a non-empty match or we are not inside an optional
1516                  subexpression.  Accept this right away.  */
1517               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1518             }
1519           else
1520             {
1521               if (dfa->nodes[cur_node].opt_subexp
1522                   && prev_idx_match[reg_num].rm_so != -1)
1523                 /* We transited through an empty match for an optional
1524                    subexpression, like (a?)*, and this is not the subexp's
1525                    first match.  Copy back the old content of the registers
1526                    so that matches of an inner subexpression are undone as
1527                    well, like in ((a?))*.  */
1528                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1529               else
1530                 /* We completed a subexpression, but it may be part of
1531                    an optional one, so do not update PREV_IDX_MATCH.  */
1532                 pmatch[reg_num].rm_eo = cur_idx;
1533             }
1534         }
1535     }
1536 }
1537
1538 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1539    and sift the nodes in each states according to the following rules.
1540    Updated state_log will be wrote to STATE_LOG.
1541
1542    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1543      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1544         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1545         the LAST_NODE, we throw away the node `a'.
1546      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1547         string `s' and transit to `b':
1548         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1549            away the node `a'.
1550         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1551             thrown away, we throw away the node `a'.
1552      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1553         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1554            node `a'.
1555         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1556             we throw away the node `a'.  */
1557
1558 #define STATE_NODE_CONTAINS(state,node) \
1559   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1560
1561 static reg_errcode_t
1562 sift_states_backward (mctx, sctx)
1563      re_match_context_t *mctx;
1564      re_sift_context_t *sctx;
1565 {
1566   reg_errcode_t err;
1567   int null_cnt = 0;
1568   int str_idx = sctx->last_str_idx;
1569   re_node_set cur_dest;
1570
1571 #ifdef DEBUG
1572   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1573 #endif
1574
1575   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1576      transit to the last_node and the last_node itself.  */
1577   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1578   if (BE (err != REG_NOERROR, 0))
1579     return err;
1580   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1581   if (BE (err != REG_NOERROR, 0))
1582     goto free_return;
1583
1584   /* Then check each states in the state_log.  */
1585   while (str_idx > 0)
1586     {
1587       /* Update counters.  */
1588       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1589       if (null_cnt > mctx->max_mb_elem_len)
1590         {
1591           memset (sctx->sifted_states, '\0',
1592                   sizeof (re_dfastate_t *) * str_idx);
1593           re_node_set_free (&cur_dest);
1594           return REG_NOERROR;
1595         }
1596       re_node_set_empty (&cur_dest);
1597       --str_idx;
1598
1599       if (mctx->state_log[str_idx])
1600         {
1601           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1602           if (BE (err != REG_NOERROR, 0))
1603             goto free_return;
1604         }
1605
1606       /* Add all the nodes which satisfy the following conditions:
1607          - It can epsilon transit to a node in CUR_DEST.
1608          - It is in CUR_SRC.
1609          And update state_log.  */
1610       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1611       if (BE (err != REG_NOERROR, 0))
1612         goto free_return;
1613     }
1614   err = REG_NOERROR;
1615  free_return:
1616   re_node_set_free (&cur_dest);
1617   return err;
1618 }
1619
1620 static reg_errcode_t
1621 build_sifted_states (mctx, sctx, str_idx, cur_dest)
1622      re_match_context_t *mctx;
1623      re_sift_context_t *sctx;
1624      int str_idx;
1625      re_node_set *cur_dest;
1626 {
1627   re_dfa_t *const dfa = mctx->dfa;
1628   re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1629   int i;
1630
1631   /* Then build the next sifted state.
1632      We build the next sifted state on `cur_dest', and update
1633      `sifted_states[str_idx]' with `cur_dest'.
1634      Note:
1635      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1636      `cur_src' points the node_set of the old `state_log[str_idx]'
1637      (with the epsilon nodes pre-filtered out).  */
1638   for (i = 0; i < cur_src->nelem; i++)
1639     {
1640       int prev_node = cur_src->elems[i];
1641       int naccepted = 0;
1642       int ret;
1643
1644 #ifdef DEBUG
1645       re_token_type_t type = dfa->nodes[prev_node].type;
1646       assert (!IS_EPSILON_NODE (type));
1647 #endif
1648 #ifdef RE_ENABLE_I18N
1649       /* If the node may accept `multi byte'.  */
1650       if (dfa->nodes[prev_node].accept_mb)
1651         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1652                                          str_idx, sctx->last_str_idx);
1653 #endif /* RE_ENABLE_I18N */
1654
1655       /* We don't check backreferences here.
1656          See update_cur_sifted_state().  */
1657       if (!naccepted
1658           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1659           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1660                                   dfa->nexts[prev_node]))
1661         naccepted = 1;
1662
1663       if (naccepted == 0)
1664         continue;
1665
1666       if (sctx->limits.nelem)
1667         {
1668           int to_idx = str_idx + naccepted;
1669           if (check_dst_limits (mctx, &sctx->limits,
1670                                 dfa->nexts[prev_node], to_idx,
1671                                 prev_node, str_idx))
1672             continue;
1673         }
1674       ret = re_node_set_insert (cur_dest, prev_node);
1675       if (BE (ret == -1, 0))
1676         return REG_ESPACE;
1677     }
1678
1679   return REG_NOERROR;
1680 }
1681
1682 /* Helper functions.  */
1683
1684 static reg_errcode_t
1685 clean_state_log_if_needed (mctx, next_state_log_idx)
1686     re_match_context_t *mctx;
1687     int next_state_log_idx;
1688 {
1689   int top = mctx->state_log_top;
1690
1691   if (next_state_log_idx >= mctx->input.bufs_len
1692       || (next_state_log_idx >= mctx->input.valid_len
1693           && mctx->input.valid_len < mctx->input.len))
1694     {
1695       reg_errcode_t err;
1696       err = extend_buffers (mctx);
1697       if (BE (err != REG_NOERROR, 0))
1698         return err;
1699     }
1700
1701   if (top < next_state_log_idx)
1702     {
1703       memset (mctx->state_log + top + 1, '\0',
1704               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1705       mctx->state_log_top = next_state_log_idx;
1706     }
1707   return REG_NOERROR;
1708 }
1709
1710 static reg_errcode_t
1711 merge_state_array (dfa, dst, src, num)
1712      re_dfa_t *dfa;
1713      re_dfastate_t **dst;
1714      re_dfastate_t **src;
1715      int num;
1716 {
1717   int st_idx;
1718   reg_errcode_t err;
1719   for (st_idx = 0; st_idx < num; ++st_idx)
1720     {
1721       if (dst[st_idx] == NULL)
1722         dst[st_idx] = src[st_idx];
1723       else if (src[st_idx] != NULL)
1724         {
1725           re_node_set merged_set;
1726           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1727                                         &src[st_idx]->nodes);
1728           if (BE (err != REG_NOERROR, 0))
1729             return err;
1730           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1731           re_node_set_free (&merged_set);
1732           if (BE (err != REG_NOERROR, 0))
1733             return err;
1734         }
1735     }
1736   return REG_NOERROR;
1737 }
1738
1739 static reg_errcode_t
1740 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1741      re_match_context_t *mctx;
1742      re_sift_context_t *sctx;
1743      int str_idx;
1744      re_node_set *dest_nodes;
1745 {
1746   re_dfa_t *const dfa = mctx->dfa;
1747   reg_errcode_t err;
1748   const re_node_set *candidates;
1749   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1750                 : &mctx->state_log[str_idx]->nodes);
1751
1752   if (dest_nodes->nelem == 0)
1753     sctx->sifted_states[str_idx] = NULL;
1754   else
1755     {
1756       if (candidates)
1757         {
1758           /* At first, add the nodes which can epsilon transit to a node in
1759              DEST_NODE.  */
1760           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1761           if (BE (err != REG_NOERROR, 0))
1762             return err;
1763
1764           /* Then, check the limitations in the current sift_context.  */
1765           if (sctx->limits.nelem)
1766             {
1767               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1768                                          mctx->bkref_ents, str_idx);
1769               if (BE (err != REG_NOERROR, 0))
1770                 return err;
1771             }
1772         }
1773
1774       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1775       if (BE (err != REG_NOERROR, 0))
1776         return err;
1777     }
1778
1779   if (candidates && mctx->state_log[str_idx]->has_backref)
1780     {
1781       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1782       if (BE (err != REG_NOERROR, 0))
1783         return err;
1784     }
1785   return REG_NOERROR;
1786 }
1787
1788 static reg_errcode_t
1789 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1790      re_dfa_t *dfa;
1791      re_node_set *dest_nodes;
1792      const re_node_set *candidates;
1793 {
1794   reg_errcode_t err = REG_NOERROR;
1795   int i;
1796
1797   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1798   if (BE (err != REG_NOERROR, 0))
1799     return err;
1800
1801   if (!state->inveclosure.alloc)
1802     {
1803       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1804       if (BE (err != REG_NOERROR, 0))
1805         return REG_ESPACE;
1806       for (i = 0; i < dest_nodes->nelem; i++)
1807         re_node_set_merge (&state->inveclosure,
1808                            dfa->inveclosures + dest_nodes->elems[i]);
1809     }
1810   return re_node_set_add_intersect (dest_nodes, candidates,
1811                                     &state->inveclosure);
1812 }
1813
1814 static reg_errcode_t
1815 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1816      re_dfa_t *dfa;
1817      int node;
1818      re_node_set *dest_nodes;
1819      const re_node_set *candidates;
1820 {
1821     int ecl_idx;
1822     reg_errcode_t err;
1823     re_node_set *inv_eclosure = dfa->inveclosures + node;
1824     re_node_set except_nodes;
1825     re_node_set_init_empty (&except_nodes);
1826     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1827       {
1828         int cur_node = inv_eclosure->elems[ecl_idx];
1829         if (cur_node == node)
1830           continue;
1831         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1832           {
1833             int edst1 = dfa->edests[cur_node].elems[0];
1834             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1835                          ? dfa->edests[cur_node].elems[1] : -1);
1836             if ((!re_node_set_contains (inv_eclosure, edst1)
1837                  && re_node_set_contains (dest_nodes, edst1))
1838                 || (edst2 > 0
1839                     && !re_node_set_contains (inv_eclosure, edst2)
1840                     && re_node_set_contains (dest_nodes, edst2)))
1841               {
1842                 err = re_node_set_add_intersect (&except_nodes, candidates,
1843                                                  dfa->inveclosures + cur_node);
1844                 if (BE (err != REG_NOERROR, 0))
1845                   {
1846                     re_node_set_free (&except_nodes);
1847                     return err;
1848                   }
1849               }
1850           }
1851       }
1852     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1853       {
1854         int cur_node = inv_eclosure->elems[ecl_idx];
1855         if (!re_node_set_contains (&except_nodes, cur_node))
1856           {
1857             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1858             re_node_set_remove_at (dest_nodes, idx);
1859           }
1860       }
1861     re_node_set_free (&except_nodes);
1862     return REG_NOERROR;
1863 }
1864
1865 static int
1866 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1867      re_match_context_t *mctx;
1868      re_node_set *limits;
1869      int dst_node, dst_idx, src_node, src_idx;
1870 {
1871   re_dfa_t *const dfa = mctx->dfa;
1872   int lim_idx, src_pos, dst_pos;
1873
1874   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1875   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1876   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1877     {
1878       int subexp_idx;
1879       struct re_backref_cache_entry *ent;
1880       ent = mctx->bkref_ents + limits->elems[lim_idx];
1881       subexp_idx = dfa->nodes[ent->node].opr.idx;
1882
1883       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1884                                            subexp_idx, dst_node, dst_idx,
1885                                            dst_bkref_idx);
1886       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1887                                            subexp_idx, src_node, src_idx,
1888                                            src_bkref_idx);
1889
1890       /* In case of:
1891          <src> <dst> ( <subexp> )
1892          ( <subexp> ) <src> <dst>
1893          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1894       if (src_pos == dst_pos)
1895         continue; /* This is unrelated limitation.  */
1896       else
1897         return 1;
1898     }
1899   return 0;
1900 }
1901
1902 static int
1903 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1904      re_match_context_t *mctx;
1905      int boundaries, subexp_idx, from_node, bkref_idx;
1906 {
1907   re_dfa_t *const dfa = mctx->dfa;
1908   re_node_set *eclosures = dfa->eclosures + from_node;
1909   int node_idx;
1910
1911   /* Else, we are on the boundary: examine the nodes on the epsilon
1912      closure.  */
1913   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1914     {
1915       int node = eclosures->elems[node_idx];
1916       switch (dfa->nodes[node].type)
1917         {
1918         case OP_BACK_REF:
1919           if (bkref_idx != -1)
1920             {
1921               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1922               do
1923                 {
1924                   int dst, cpos;
1925
1926                   if (ent->node != node)
1927                     continue;
1928
1929                   if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1930                       && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1931                     continue;
1932
1933                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1934                      OP_CLOSE_SUBEXP cases below.  But, if the
1935                      destination node is the same node as the source
1936                      node, don't recurse because it would cause an
1937                      infinite loop: a regex that exhibits this behavior
1938                      is ()\1*\1*  */
1939                   dst = dfa->edests[node].elems[0];
1940                   if (dst == from_node)
1941                     {
1942                       if (boundaries & 1)
1943                         return -1;
1944                       else /* if (boundaries & 2) */
1945                         return 0;
1946                     }
1947
1948                   cpos =
1949                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1950                                                  dst, bkref_idx);
1951                   if (cpos == -1 /* && (boundaries & 1) */)
1952                     return -1;
1953                   if (cpos == 0 && (boundaries & 2))
1954                     return 0;
1955
1956                   ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1957                 }
1958               while (ent++->more);
1959             }
1960           break;
1961
1962         case OP_OPEN_SUBEXP:
1963           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1964             return -1;
1965           break;
1966
1967         case OP_CLOSE_SUBEXP:
1968           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1969             return 0;
1970           break;
1971
1972         default:
1973             break;
1974         }
1975     }
1976
1977   return (boundaries & 2) ? 1 : 0;
1978 }
1979
1980 static int
1981 check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1982      re_match_context_t *mctx;
1983      int limit, subexp_idx, from_node, str_idx, bkref_idx;
1984 {
1985   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1986   int boundaries;
1987
1988   /* If we are outside the range of the subexpression, return -1 or 1.  */
1989   if (str_idx < lim->subexp_from)
1990     return -1;
1991
1992   if (lim->subexp_to < str_idx)
1993     return 1;
1994
1995   /* If we are within the subexpression, return 0.  */
1996   boundaries = (str_idx == lim->subexp_from);
1997   boundaries |= (str_idx == lim->subexp_to) << 1;
1998   if (boundaries == 0)
1999     return 0;
2000
2001   /* Else, examine epsilon closure.  */
2002   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2003                                       from_node, bkref_idx);
2004 }
2005
2006 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2007    which are against limitations from DEST_NODES. */
2008
2009 static reg_errcode_t
2010 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
2011      re_dfa_t *dfa;
2012      re_node_set *dest_nodes;
2013      const re_node_set *candidates;
2014      re_node_set *limits;
2015      struct re_backref_cache_entry *bkref_ents;
2016      int str_idx;
2017 {
2018   reg_errcode_t err;
2019   int node_idx, lim_idx;
2020
2021   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2022     {
2023       int subexp_idx;
2024       struct re_backref_cache_entry *ent;
2025       ent = bkref_ents + limits->elems[lim_idx];
2026
2027       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2028         continue; /* This is unrelated limitation.  */
2029
2030       subexp_idx = dfa->nodes[ent->node].opr.idx;
2031       if (ent->subexp_to == str_idx)
2032         {
2033           int ops_node = -1;
2034           int cls_node = -1;
2035           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2036             {
2037               int node = dest_nodes->elems[node_idx];
2038               re_token_type_t type = dfa->nodes[node].type;
2039               if (type == OP_OPEN_SUBEXP
2040                   && subexp_idx == dfa->nodes[node].opr.idx)
2041                 ops_node = node;
2042               else if (type == OP_CLOSE_SUBEXP
2043                        && subexp_idx == dfa->nodes[node].opr.idx)
2044                 cls_node = node;
2045             }
2046
2047           /* Check the limitation of the open subexpression.  */
2048           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2049           if (ops_node >= 0)
2050             {
2051               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2052                                            candidates);
2053               if (BE (err != REG_NOERROR, 0))
2054                 return err;
2055             }
2056
2057           /* Check the limitation of the close subexpression.  */
2058           if (cls_node >= 0)
2059             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2060               {
2061                 int node = dest_nodes->elems[node_idx];
2062                 if (!re_node_set_contains (dfa->inveclosures + node,
2063                                            cls_node)
2064                     && !re_node_set_contains (dfa->eclosures + node,
2065                                               cls_node))
2066                   {
2067                     /* It is against this limitation.
2068                        Remove it form the current sifted state.  */
2069                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2070                                                  candidates);
2071                     if (BE (err != REG_NOERROR, 0))
2072                       return err;
2073                     --node_idx;
2074                   }
2075               }
2076         }
2077       else /* (ent->subexp_to != str_idx)  */
2078         {
2079           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2080             {
2081               int node = dest_nodes->elems[node_idx];
2082               re_token_type_t type = dfa->nodes[node].type;
2083               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2084                 {
2085                   if (subexp_idx != dfa->nodes[node].opr.idx)
2086                     continue;
2087                   /* It is against this limitation.
2088                      Remove it form the current sifted state.  */
2089                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2090                                                candidates);
2091                   if (BE (err != REG_NOERROR, 0))
2092                     return err;
2093                 }
2094             }
2095         }
2096     }
2097   return REG_NOERROR;
2098 }
2099
2100 static reg_errcode_t
2101 sift_states_bkref (mctx, sctx, str_idx, candidates)
2102      re_match_context_t *mctx;
2103      re_sift_context_t *sctx;
2104      int str_idx;
2105      const re_node_set *candidates;
2106 {
2107   re_dfa_t *const dfa = mctx->dfa;
2108   reg_errcode_t err;
2109   int node_idx, node;
2110   re_sift_context_t local_sctx;
2111   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2112
2113   if (first_idx == -1)
2114     return REG_NOERROR;
2115
2116   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2117
2118   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2119     {
2120       int enabled_idx;
2121       re_token_type_t type;
2122       struct re_backref_cache_entry *entry;
2123       node = candidates->elems[node_idx];
2124       type = dfa->nodes[node].type;
2125       /* Avoid infinite loop for the REs like "()\1+".  */
2126       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2127         continue;
2128       if (type != OP_BACK_REF)
2129         continue;
2130
2131       entry = mctx->bkref_ents + first_idx;
2132       enabled_idx = first_idx;
2133       do
2134         {
2135           int subexp_len, to_idx, dst_node;
2136           re_dfastate_t *cur_state;
2137
2138           if (entry->node != node)
2139             continue;
2140           subexp_len = entry->subexp_to - entry->subexp_from;
2141           to_idx = str_idx + subexp_len;
2142           dst_node = (subexp_len ? dfa->nexts[node]
2143                       : dfa->edests[node].elems[0]);
2144
2145           if (to_idx > sctx->last_str_idx
2146               || sctx->sifted_states[to_idx] == NULL
2147               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2148               || check_dst_limits (mctx, &sctx->limits, node,
2149                                    str_idx, dst_node, to_idx))
2150             continue;
2151
2152           if (local_sctx.sifted_states == NULL)
2153             {
2154               local_sctx = *sctx;
2155               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2156               if (BE (err != REG_NOERROR, 0))
2157                 goto free_return;
2158             }
2159           local_sctx.last_node = node;
2160           local_sctx.last_str_idx = str_idx;
2161           err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2162           if (BE (err < 0, 0))
2163             {
2164               err = REG_ESPACE;
2165               goto free_return;
2166             }
2167           cur_state = local_sctx.sifted_states[str_idx];
2168           err = sift_states_backward (mctx, &local_sctx);
2169           if (BE (err != REG_NOERROR, 0))
2170             goto free_return;
2171           if (sctx->limited_states != NULL)
2172             {
2173               err = merge_state_array (dfa, sctx->limited_states,
2174                                        local_sctx.sifted_states,
2175                                        str_idx + 1);
2176               if (BE (err != REG_NOERROR, 0))
2177                 goto free_return;
2178             }
2179           local_sctx.sifted_states[str_idx] = cur_state;
2180           re_node_set_remove (&local_sctx.limits, enabled_idx);
2181
2182           /* mctx->bkref_ents may have changed, reload the pointer.  */
2183           entry = mctx->bkref_ents + enabled_idx;
2184         }
2185       while (enabled_idx++, entry++->more);
2186     }
2187   err = REG_NOERROR;
2188  free_return:
2189   if (local_sctx.sifted_states != NULL)
2190     {
2191       re_node_set_free (&local_sctx.limits);
2192     }
2193
2194   return err;
2195 }
2196
2197
2198 #ifdef RE_ENABLE_I18N
2199 static int
2200 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2201     const re_match_context_t *mctx;
2202     re_sift_context_t *sctx;
2203     int node_idx, str_idx, max_str_idx;
2204 {
2205   re_dfa_t *const dfa = mctx->dfa;
2206   int naccepted;
2207   /* Check the node can accept `multi byte'.  */
2208   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2209   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2210       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2211                             dfa->nexts[node_idx]))
2212     /* The node can't accept the `multi byte', or the
2213        destination was already thrown away, then the node
2214        could't accept the current input `multi byte'.   */
2215     naccepted = 0;
2216   /* Otherwise, it is sure that the node could accept
2217      `naccepted' bytes input.  */
2218   return naccepted;
2219 }
2220 #endif /* RE_ENABLE_I18N */
2221
2222 \f
2223 /* Functions for state transition.  */
2224
2225 /* Return the next state to which the current state STATE will transit by
2226    accepting the current input byte, and update STATE_LOG if necessary.
2227    If STATE can accept a multibyte char/collating element/back reference
2228    update the destination of STATE_LOG.  */
2229
2230 static re_dfastate_t *
2231 transit_state (err, mctx, state)
2232      reg_errcode_t *err;
2233      re_match_context_t *mctx;
2234      re_dfastate_t *state;
2235 {
2236   re_dfastate_t **trtable;
2237   unsigned char ch;
2238
2239 #ifdef RE_ENABLE_I18N
2240   /* If the current state can accept multibyte.  */
2241   if (BE (state->accept_mb, 0))
2242     {
2243       *err = transit_state_mb (mctx, state);
2244       if (BE (*err != REG_NOERROR, 0))
2245         return NULL;
2246     }
2247 #endif /* RE_ENABLE_I18N */
2248
2249   /* Then decide the next state with the single byte.  */
2250 #if 0
2251   if (0)
2252     /* don't use transition table  */
2253     return transit_state_sb (err, mctx, state);
2254 #endif
2255
2256   /* Use transition table  */
2257   ch = re_string_fetch_byte (&mctx->input);
2258   for (;;)
2259     {
2260       trtable = state->trtable;
2261       if (BE (trtable != NULL, 1))
2262         return trtable[ch];
2263
2264       trtable = state->word_trtable;
2265       if (BE (trtable != NULL, 1))
2266         {
2267           unsigned int context;
2268           context
2269             = re_string_context_at (&mctx->input,
2270                                     re_string_cur_idx (&mctx->input) - 1,
2271                                     mctx->eflags);
2272           if (IS_WORD_CONTEXT (context))
2273             return trtable[ch + SBC_MAX];
2274           else
2275             return trtable[ch];
2276         }
2277
2278       if (!build_trtable (mctx->dfa, state))
2279         {
2280           *err = REG_ESPACE;
2281           return NULL;
2282         }
2283
2284       /* Retry, we now have a transition table.  */
2285     }
2286 }
2287
2288 /* Update the state_log if we need */
2289 re_dfastate_t *
2290 merge_state_with_log (err, mctx, next_state)
2291      reg_errcode_t *err;
2292      re_match_context_t *mctx;
2293      re_dfastate_t *next_state;
2294 {
2295   re_dfa_t *const dfa = mctx->dfa;
2296   int cur_idx = re_string_cur_idx (&mctx->input);
2297
2298   if (cur_idx > mctx->state_log_top)
2299     {
2300       mctx->state_log[cur_idx] = next_state;
2301       mctx->state_log_top = cur_idx;
2302     }
2303   else if (mctx->state_log[cur_idx] == 0)
2304     {
2305       mctx->state_log[cur_idx] = next_state;
2306     }
2307   else
2308     {
2309       re_dfastate_t *pstate;
2310       unsigned int context;
2311       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2312       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2313          the destination of a multibyte char/collating element/
2314          back reference.  Then the next state is the union set of
2315          these destinations and the results of the transition table.  */
2316       pstate = mctx->state_log[cur_idx];
2317       log_nodes = pstate->entrance_nodes;
2318       if (next_state != NULL)
2319         {
2320           table_nodes = next_state->entrance_nodes;
2321           *err = re_node_set_init_union (&next_nodes, table_nodes,
2322                                              log_nodes);
2323           if (BE (*err != REG_NOERROR, 0))
2324             return NULL;
2325         }
2326       else
2327         next_nodes = *log_nodes;
2328       /* Note: We already add the nodes of the initial state,
2329          then we don't need to add them here.  */
2330
2331       context = re_string_context_at (&mctx->input,
2332                                       re_string_cur_idx (&mctx->input) - 1,
2333                                       mctx->eflags);
2334       next_state = mctx->state_log[cur_idx]
2335         = re_acquire_state_context (err, dfa, &next_nodes, context);
2336       /* We don't need to check errors here, since the return value of
2337          this function is next_state and ERR is already set.  */
2338
2339       if (table_nodes != NULL)
2340         re_node_set_free (&next_nodes);
2341     }
2342
2343   if (BE (dfa->nbackref, 0) && next_state != NULL)
2344     {
2345       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2346          later.  We must check them here, since the back references in the
2347          next state might use them.  */
2348       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2349                                         cur_idx);
2350       if (BE (*err != REG_NOERROR, 0))
2351         return NULL;
2352
2353       /* If the next state has back references.  */
2354       if (next_state->has_backref)
2355         {
2356           *err = transit_state_bkref (mctx, &next_state->nodes);
2357           if (BE (*err != REG_NOERROR, 0))
2358             return NULL;
2359           next_state = mctx->state_log[cur_idx];
2360         }
2361     }
2362
2363   return next_state;
2364 }
2365
2366 /* Skip bytes in the input that correspond to part of a
2367    multi-byte match, then look in the log for a state
2368    from which to restart matching.  */
2369 re_dfastate_t *
2370 find_recover_state (err, mctx)
2371      reg_errcode_t *err;
2372      re_match_context_t *mctx;
2373 {
2374   re_dfastate_t *cur_state = NULL;
2375   do
2376     {
2377       int max = mctx->state_log_top;
2378       int cur_str_idx = re_string_cur_idx (&mctx->input);
2379
2380       do
2381         {
2382           if (++cur_str_idx > max)
2383             return NULL;
2384           re_string_skip_bytes (&mctx->input, 1);
2385         }
2386       while (mctx->state_log[cur_str_idx] == NULL);
2387
2388       cur_state = merge_state_with_log (err, mctx, NULL);
2389     }
2390   while (err == REG_NOERROR && cur_state == NULL);
2391   return cur_state;
2392 }
2393
2394 /* Helper functions for transit_state.  */
2395
2396 /* From the node set CUR_NODES, pick up the nodes whose types are
2397    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2398    expression. And register them to use them later for evaluating the
2399    correspoding back references.  */
2400
2401 static reg_errcode_t
2402 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2403      re_match_context_t *mctx;
2404      re_node_set *cur_nodes;
2405      int str_idx;
2406 {
2407   re_dfa_t *const dfa = mctx->dfa;
2408   int node_idx;
2409   reg_errcode_t err;
2410
2411   /* TODO: This isn't efficient.
2412            Because there might be more than one nodes whose types are
2413            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2414            nodes.
2415            E.g. RE: (a){2}  */
2416   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2417     {
2418       int node = cur_nodes->elems[node_idx];
2419       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2420           && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2421           && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2422         {
2423           err = match_ctx_add_subtop (mctx, node, str_idx);
2424           if (BE (err != REG_NOERROR, 0))
2425             return err;
2426         }
2427     }
2428   return REG_NOERROR;
2429 }
2430
2431 #if 0
2432 /* Return the next state to which the current state STATE will transit by
2433    accepting the current input byte.  */
2434
2435 static re_dfastate_t *
2436 transit_state_sb (err, mctx, state)
2437      reg_errcode_t *err;
2438      re_match_context_t *mctx;
2439      re_dfastate_t *state;
2440 {
2441   re_dfa_t *const dfa = mctx->dfa;
2442   re_node_set next_nodes;
2443   re_dfastate_t *next_state;
2444   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2445   unsigned int context;
2446
2447   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2448   if (BE (*err != REG_NOERROR, 0))
2449     return NULL;
2450   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2451     {
2452       int cur_node = state->nodes.elems[node_cnt];
2453       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2454         {
2455           *err = re_node_set_merge (&next_nodes,
2456                                     dfa->eclosures + dfa->nexts[cur_node]);
2457           if (BE (*err != REG_NOERROR, 0))
2458             {
2459               re_node_set_free (&next_nodes);
2460               return NULL;
2461             }
2462         }
2463     }
2464   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2465   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2466   /* We don't need to check errors here, since the return value of
2467      this function is next_state and ERR is already set.  */
2468
2469   re_node_set_free (&next_nodes);
2470   re_string_skip_bytes (&mctx->input, 1);
2471   return next_state;
2472 }
2473 #endif
2474
2475 #ifdef RE_ENABLE_I18N
2476 static reg_errcode_t
2477 transit_state_mb (mctx, pstate)
2478     re_match_context_t *mctx;
2479     re_dfastate_t *pstate;
2480 {
2481   re_dfa_t *const dfa = mctx->dfa;
2482   reg_errcode_t err;
2483   int i;
2484
2485   for (i = 0; i < pstate->nodes.nelem; ++i)
2486     {
2487       re_node_set dest_nodes, *new_nodes;
2488       int cur_node_idx = pstate->nodes.elems[i];
2489       int naccepted, dest_idx;
2490       unsigned int context;
2491       re_dfastate_t *dest_state;
2492
2493       if (!dfa->nodes[cur_node_idx].accept_mb)
2494         continue;
2495
2496       if (dfa->nodes[cur_node_idx].constraint)
2497         {
2498           context = re_string_context_at (&mctx->input,
2499                                           re_string_cur_idx (&mctx->input),
2500                                           mctx->eflags);
2501           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2502                                            context))
2503             continue;
2504         }
2505
2506       /* How many bytes the node can accept?  */
2507       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2508                                            re_string_cur_idx (&mctx->input));
2509       if (naccepted == 0)
2510         continue;
2511
2512       /* The node can accepts `naccepted' bytes.  */
2513       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2514       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2515                                : mctx->max_mb_elem_len);
2516       err = clean_state_log_if_needed (mctx, dest_idx);
2517       if (BE (err != REG_NOERROR, 0))
2518         return err;
2519 #ifdef DEBUG
2520       assert (dfa->nexts[cur_node_idx] != -1);
2521 #endif
2522       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2523
2524       dest_state = mctx->state_log[dest_idx];
2525       if (dest_state == NULL)
2526         dest_nodes = *new_nodes;
2527       else
2528         {
2529           err = re_node_set_init_union (&dest_nodes,
2530                                         dest_state->entrance_nodes, new_nodes);
2531           if (BE (err != REG_NOERROR, 0))
2532             return err;
2533         }
2534       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2535       mctx->state_log[dest_idx]
2536         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2537       if (dest_state != NULL)
2538         re_node_set_free (&dest_nodes);
2539       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2540         return err;
2541     }
2542   return REG_NOERROR;
2543 }
2544 #endif /* RE_ENABLE_I18N */
2545
2546 static reg_errcode_t
2547 transit_state_bkref (mctx, nodes)
2548     re_match_context_t *mctx;
2549     const re_node_set *nodes;
2550 {
2551   re_dfa_t *const dfa = mctx->dfa;
2552   reg_errcode_t err;
2553   int i;
2554   int cur_str_idx = re_string_cur_idx (&mctx->input);
2555
2556   for (i = 0; i < nodes->nelem; ++i)
2557     {
2558       int dest_str_idx, prev_nelem, bkc_idx;
2559       int node_idx = nodes->elems[i];
2560       unsigned int context;
2561       const re_token_t *node = dfa->nodes + node_idx;
2562       re_node_set *new_dest_nodes;
2563
2564       /* Check whether `node' is a backreference or not.  */
2565       if (node->type != OP_BACK_REF)
2566         continue;
2567
2568       if (node->constraint)
2569         {
2570           context = re_string_context_at (&mctx->input, cur_str_idx,
2571                                           mctx->eflags);
2572           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2573             continue;
2574         }
2575
2576       /* `node' is a backreference.
2577          Check the substring which the substring matched.  */
2578       bkc_idx = mctx->nbkref_ents;
2579       err = get_subexp (mctx, node_idx, cur_str_idx);
2580       if (BE (err != REG_NOERROR, 0))
2581         goto free_return;
2582
2583       /* And add the epsilon closures (which is `new_dest_nodes') of
2584          the backreference to appropriate state_log.  */
2585 #ifdef DEBUG
2586       assert (dfa->nexts[node_idx] != -1);
2587 #endif
2588       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2589         {
2590           int subexp_len;
2591           re_dfastate_t *dest_state;
2592           struct re_backref_cache_entry *bkref_ent;
2593           bkref_ent = mctx->bkref_ents + bkc_idx;
2594           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2595             continue;
2596           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2597           new_dest_nodes = (subexp_len == 0
2598                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2599                             : dfa->eclosures + dfa->nexts[node_idx]);
2600           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2601                           - bkref_ent->subexp_from);
2602           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2603                                           mctx->eflags);
2604           dest_state = mctx->state_log[dest_str_idx];
2605           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2606                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2607           /* Add `new_dest_node' to state_log.  */
2608           if (dest_state == NULL)
2609             {
2610               mctx->state_log[dest_str_idx]
2611                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2612                                             context);
2613               if (BE (mctx->state_log[dest_str_idx] == NULL
2614                       && err != REG_NOERROR, 0))
2615                 goto free_return;
2616             }
2617           else
2618             {
2619               re_node_set dest_nodes;
2620               err = re_node_set_init_union (&dest_nodes,
2621                                             dest_state->entrance_nodes,
2622                                             new_dest_nodes);
2623               if (BE (err != REG_NOERROR, 0))
2624                 {
2625                   re_node_set_free (&dest_nodes);
2626                   goto free_return;
2627                 }
2628               mctx->state_log[dest_str_idx]
2629                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2630               re_node_set_free (&dest_nodes);
2631               if (BE (mctx->state_log[dest_str_idx] == NULL
2632                       && err != REG_NOERROR, 0))
2633                 goto free_return;
2634             }
2635           /* We need to check recursively if the backreference can epsilon
2636              transit.  */
2637           if (subexp_len == 0
2638               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2639             {
2640               err = check_subexp_matching_top (mctx, new_dest_nodes,
2641                                                cur_str_idx);
2642               if (BE (err != REG_NOERROR, 0))
2643                 goto free_return;
2644               err = transit_state_bkref (mctx, new_dest_nodes);
2645               if (BE (err != REG_NOERROR, 0))
2646                 goto free_return;
2647             }
2648         }
2649     }
2650   err = REG_NOERROR;
2651  free_return:
2652   return err;
2653 }
2654
2655 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2656    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2657    Note that we might collect inappropriate candidates here.
2658    However, the cost of checking them strictly here is too high, then we
2659    delay these checking for prune_impossible_nodes().  */
2660
2661 static reg_errcode_t
2662 get_subexp (mctx, bkref_node, bkref_str_idx)
2663      re_match_context_t *mctx;
2664      int bkref_node, bkref_str_idx;
2665 {
2666   re_dfa_t *const dfa = mctx->dfa;
2667   int subexp_num, sub_top_idx;
2668   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2669   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2670   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2671   if (cache_idx != -1)
2672     {
2673       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2674       do
2675         if (entry->node == bkref_node)
2676           return REG_NOERROR; /* We already checked it.  */
2677       while (entry++->more);
2678     }
2679
2680   subexp_num = dfa->nodes[bkref_node].opr.idx;
2681
2682   /* For each sub expression  */
2683   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2684     {
2685       reg_errcode_t err;
2686       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2687       re_sub_match_last_t *sub_last;
2688       int sub_last_idx, sl_str, bkref_str_off;
2689
2690       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2691         continue; /* It isn't related.  */
2692
2693       sl_str = sub_top->str_idx;
2694       bkref_str_off = bkref_str_idx;
2695       /* At first, check the last node of sub expressions we already
2696          evaluated.  */
2697       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2698         {
2699           int sl_str_diff;
2700           sub_last = sub_top->lasts[sub_last_idx];
2701           sl_str_diff = sub_last->str_idx - sl_str;
2702           /* The matched string by the sub expression match with the substring
2703              at the back reference?  */
2704           if (sl_str_diff > 0)
2705             {
2706               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2707                 {
2708                   /* Not enough chars for a successful match.  */
2709                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2710                     break;
2711
2712                   err = clean_state_log_if_needed (mctx,
2713                                                    bkref_str_off
2714                                                    + sl_str_diff);
2715                   if (BE (err != REG_NOERROR, 0))
2716                     return err;
2717                   buf = (const char *) re_string_get_buffer (&mctx->input);
2718                 }
2719               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2720                 break; /* We don't need to search this sub expression any more.  */
2721             }
2722           bkref_str_off += sl_str_diff;
2723           sl_str += sl_str_diff;
2724           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2725                                 bkref_str_idx);
2726
2727           /* Reload buf, since the preceding call might have reallocated
2728              the buffer.  */
2729           buf = (const char *) re_string_get_buffer (&mctx->input);
2730
2731           if (err == REG_NOMATCH)
2732             continue;
2733           if (BE (err != REG_NOERROR, 0))
2734             return err;
2735         }
2736
2737       if (sub_last_idx < sub_top->nlasts)
2738         continue;
2739       if (sub_last_idx > 0)
2740         ++sl_str;
2741       /* Then, search for the other last nodes of the sub expression.  */
2742       for (; sl_str <= bkref_str_idx; ++sl_str)
2743         {
2744           int cls_node, sl_str_off;
2745           const re_node_set *nodes;
2746           sl_str_off = sl_str - sub_top->str_idx;
2747           /* The matched string by the sub expression match with the substring
2748              at the back reference?  */
2749           if (sl_str_off > 0)
2750             {
2751               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2752                 {
2753                   /* If we are at the end of the input, we cannot match.  */
2754                   if (bkref_str_off >= mctx->input.len)
2755                     break;
2756
2757                   err = extend_buffers (mctx);
2758                   if (BE (err != REG_NOERROR, 0))
2759                     return err;
2760
2761                   buf = (const char *) re_string_get_buffer (&mctx->input);
2762                 }
2763               if (buf [bkref_str_off++] != buf[sl_str - 1])
2764                 break; /* We don't need to search this sub expression
2765                           any more.  */
2766             }
2767           if (mctx->state_log[sl_str] == NULL)
2768             continue;
2769           /* Does this state have a ')' of the sub expression?  */
2770           nodes = &mctx->state_log[sl_str]->nodes;
2771           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2772           if (cls_node == -1)
2773             continue; /* No.  */
2774           if (sub_top->path == NULL)
2775             {
2776               sub_top->path = calloc (sizeof (state_array_t),
2777                                       sl_str - sub_top->str_idx + 1);
2778               if (sub_top->path == NULL)
2779                 return REG_ESPACE;
2780             }
2781           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2782              in the current context?  */
2783           err = check_arrival (mctx, sub_top->path, sub_top->node,
2784                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2785           if (err == REG_NOMATCH)
2786               continue;
2787           if (BE (err != REG_NOERROR, 0))
2788               return err;
2789           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2790           if (BE (sub_last == NULL, 0))
2791             return REG_ESPACE;
2792           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2793                                 bkref_str_idx);
2794           if (err == REG_NOMATCH)
2795             continue;
2796         }
2797     }
2798   return REG_NOERROR;
2799 }
2800
2801 /* Helper functions for get_subexp().  */
2802
2803 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2804    If it can arrive, register the sub expression expressed with SUB_TOP
2805    and SUB_LAST.  */
2806
2807 static reg_errcode_t
2808 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2809      re_match_context_t *mctx;
2810      const re_sub_match_top_t *sub_top;
2811      re_sub_match_last_t *sub_last;
2812      int bkref_node, bkref_str;
2813 {
2814   reg_errcode_t err;
2815   int to_idx;
2816   /* Can the subexpression arrive the back reference?  */
2817   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2818                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2819   if (err != REG_NOERROR)
2820     return err;
2821   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2822                              sub_last->str_idx);
2823   if (BE (err != REG_NOERROR, 0))
2824     return err;
2825   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2826   return clean_state_log_if_needed (mctx, to_idx);
2827 }
2828
2829 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2830    Search '(' if FL_OPEN, or search ')' otherwise.
2831    TODO: This function isn't efficient...
2832          Because there might be more than one nodes whose types are
2833          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2834          nodes.
2835          E.g. RE: (a){2}  */
2836
2837 static int
2838 find_subexp_node (dfa, nodes, subexp_idx, type)
2839      const re_dfa_t *dfa;
2840      const re_node_set *nodes;
2841      int subexp_idx, type;
2842 {
2843   int cls_idx;
2844   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2845     {
2846       int cls_node = nodes->elems[cls_idx];
2847       const re_token_t *node = dfa->nodes + cls_node;
2848       if (node->type == type
2849           && node->opr.idx == subexp_idx)
2850         return cls_node;
2851     }
2852   return -1;
2853 }
2854
2855 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2856    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2857    heavily reused.
2858    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2859
2860 static reg_errcode_t
2861 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2862                type)
2863      re_match_context_t *mctx;
2864      state_array_t *path;
2865      int top_node, top_str, last_node, last_str, type;
2866 {
2867   re_dfa_t *const dfa = mctx->dfa;
2868   reg_errcode_t err;
2869   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2870   re_dfastate_t *cur_state = NULL;
2871   re_node_set *cur_nodes, next_nodes;
2872   re_dfastate_t **backup_state_log;
2873   unsigned int context;
2874
2875   subexp_num = dfa->nodes[top_node].opr.idx;
2876   /* Extend the buffer if we need.  */
2877   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2878     {
2879       re_dfastate_t **new_array;
2880       int old_alloc = path->alloc;
2881       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2882       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2883       if (new_array == NULL)
2884         {
2885           path->alloc = old_alloc;
2886           return REG_ESPACE;
2887         }
2888       path->array = new_array;
2889       memset (new_array + old_alloc, '\0',
2890               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2891     }
2892
2893   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2894
2895   /* Temporary modify MCTX.  */
2896   backup_state_log = mctx->state_log;
2897   backup_cur_idx = mctx->input.cur_idx;
2898   mctx->state_log = path->array;
2899   mctx->input.cur_idx = str_idx;
2900
2901   /* Setup initial node set.  */
2902   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2903   if (str_idx == top_str)
2904     {
2905       err = re_node_set_init_1 (&next_nodes, top_node);
2906       if (BE (err != REG_NOERROR, 0))
2907         return err;
2908       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2909       if (BE (err != REG_NOERROR, 0))
2910         {
2911           re_node_set_free (&next_nodes);
2912           return err;
2913         }
2914     }
2915   else
2916     {
2917       cur_state = mctx->state_log[str_idx];
2918       if (cur_state && cur_state->has_backref)
2919         {
2920           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2921           if (BE ( err != REG_NOERROR, 0))
2922             return err;
2923         }
2924       else
2925         re_node_set_init_empty (&next_nodes);
2926     }
2927   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2928     {
2929       if (next_nodes.nelem)
2930         {
2931           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2932                                     subexp_num, type);
2933           if (BE ( err != REG_NOERROR, 0))
2934             {
2935               re_node_set_free (&next_nodes);
2936               return err;
2937             }
2938         }
2939       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2940       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2941         {
2942           re_node_set_free (&next_nodes);
2943           return err;
2944         }
2945       mctx->state_log[str_idx] = cur_state;
2946     }
2947
2948   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2949     {
2950       re_node_set_empty (&next_nodes);
2951       if (mctx->state_log[str_idx + 1])
2952         {
2953           err = re_node_set_merge (&next_nodes,
2954                                    &mctx->state_log[str_idx + 1]->nodes);
2955           if (BE (err != REG_NOERROR, 0))
2956             {
2957               re_node_set_free (&next_nodes);
2958               return err;
2959             }
2960         }
2961       if (cur_state)
2962         {
2963           err = check_arrival_add_next_nodes (mctx, str_idx,
2964                                               &cur_state->non_eps_nodes, &next_nodes);
2965           if (BE (err != REG_NOERROR, 0))
2966             {
2967               re_node_set_free (&next_nodes);
2968               return err;
2969             }
2970         }
2971       ++str_idx;
2972       if (next_nodes.nelem)
2973         {
2974           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2975           if (BE (err != REG_NOERROR, 0))
2976             {
2977               re_node_set_free (&next_nodes);
2978               return err;
2979             }
2980           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2981                                     subexp_num, type);
2982           if (BE ( err != REG_NOERROR, 0))
2983             {
2984               re_node_set_free (&next_nodes);
2985               return err;
2986             }
2987         }
2988       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2989       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2990       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2991         {
2992           re_node_set_free (&next_nodes);
2993           return err;
2994         }
2995       mctx->state_log[str_idx] = cur_state;
2996       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2997     }
2998   re_node_set_free (&next_nodes);
2999   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3000                : &mctx->state_log[last_str]->nodes);
3001   path->next_idx = str_idx;
3002
3003   /* Fix MCTX.  */
3004   mctx->state_log = backup_state_log;
3005   mctx->input.cur_idx = backup_cur_idx;
3006
3007   /* Then check the current node set has the node LAST_NODE.  */
3008   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3009     return REG_NOERROR;
3010
3011   return REG_NOMATCH;
3012 }
3013
3014 /* Helper functions for check_arrival.  */
3015
3016 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3017    to NEXT_NODES.
3018    TODO: This function is similar to the functions transit_state*(),
3019          however this function has many additional works.
3020          Can't we unify them?  */
3021
3022 static reg_errcode_t
3023 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
3024      re_match_context_t *mctx;
3025      int str_idx;
3026      re_node_set *cur_nodes, *next_nodes;
3027 {
3028   re_dfa_t *const dfa = mctx->dfa;
3029   int result;
3030   int cur_idx;
3031   reg_errcode_t err;
3032   re_node_set union_set;
3033   re_node_set_init_empty (&union_set);
3034   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3035     {
3036       int naccepted = 0;
3037       int cur_node = cur_nodes->elems[cur_idx];
3038 #ifdef DEBUG
3039       re_token_type_t type = dfa->nodes[cur_node].type;
3040       assert (!IS_EPSILON_NODE (type));
3041 #endif
3042 #ifdef RE_ENABLE_I18N
3043       /* If the node may accept `multi byte'.  */
3044       if (dfa->nodes[cur_node].accept_mb)
3045         {
3046           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3047                                                str_idx);
3048           if (naccepted > 1)
3049             {
3050               re_dfastate_t *dest_state;
3051               int next_node = dfa->nexts[cur_node];
3052               int next_idx = str_idx + naccepted;
3053               dest_state = mctx->state_log[next_idx];
3054               re_node_set_empty (&union_set);
3055               if (dest_state)
3056                 {
3057                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3058                   if (BE (err != REG_NOERROR, 0))
3059                     {
3060                       re_node_set_free (&union_set);
3061                       return err;
3062                     }
3063                 }
3064               result = re_node_set_insert (&union_set, next_node);
3065               if (BE (result < 0, 0))
3066                 {
3067                   re_node_set_free (&union_set);
3068                   return REG_ESPACE;
3069                 }
3070               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3071                                                             &union_set);
3072               if (BE (mctx->state_log[next_idx] == NULL
3073                       && err != REG_NOERROR, 0))
3074                 {
3075                   re_node_set_free (&union_set);
3076                   return err;
3077                 }
3078             }
3079         }
3080 #endif /* RE_ENABLE_I18N */
3081       if (naccepted
3082           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3083         {
3084           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3085           if (BE (result < 0, 0))
3086             {
3087               re_node_set_free (&union_set);
3088               return REG_ESPACE;
3089             }
3090         }
3091     }
3092   re_node_set_free (&union_set);
3093   return REG_NOERROR;
3094 }
3095
3096 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3097    CUR_NODES, however exclude the nodes which are:
3098     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3099     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3100 */
3101
3102 static reg_errcode_t
3103 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3104      re_dfa_t *dfa;
3105      re_node_set *cur_nodes;
3106      int ex_subexp, type;
3107 {
3108   reg_errcode_t err;
3109   int idx, outside_node;
3110   re_node_set new_nodes;
3111 #ifdef DEBUG
3112   assert (cur_nodes->nelem);
3113 #endif
3114   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3115   if (BE (err != REG_NOERROR, 0))
3116     return err;
3117   /* Create a new node set NEW_NODES with the nodes which are epsilon
3118      closures of the node in CUR_NODES.  */
3119
3120   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3121     {
3122       int cur_node = cur_nodes->elems[idx];
3123       re_node_set *eclosure = dfa->eclosures + cur_node;
3124       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3125       if (outside_node == -1)
3126         {
3127           /* There are no problematic nodes, just merge them.  */
3128           err = re_node_set_merge (&new_nodes, eclosure);
3129           if (BE (err != REG_NOERROR, 0))
3130             {
3131               re_node_set_free (&new_nodes);
3132               return err;
3133             }
3134         }
3135       else
3136         {
3137           /* There are problematic nodes, re-calculate incrementally.  */
3138           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3139                                               ex_subexp, type);
3140           if (BE (err != REG_NOERROR, 0))
3141             {
3142               re_node_set_free (&new_nodes);
3143               return err;
3144             }
3145         }
3146     }
3147   re_node_set_free (cur_nodes);
3148   *cur_nodes = new_nodes;
3149   return REG_NOERROR;
3150 }
3151
3152 /* Helper function for check_arrival_expand_ecl.
3153    Check incrementally the epsilon closure of TARGET, and if it isn't
3154    problematic append it to DST_NODES.  */
3155
3156 static reg_errcode_t
3157 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3158      re_dfa_t *dfa;
3159      int target, ex_subexp, type;
3160      re_node_set *dst_nodes;
3161 {
3162   int cur_node;
3163   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3164     {
3165       int err;
3166
3167       if (dfa->nodes[cur_node].type == type
3168           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3169         {
3170           if (type == OP_CLOSE_SUBEXP)
3171             {
3172               err = re_node_set_insert (dst_nodes, cur_node);
3173               if (BE (err == -1, 0))
3174                 return REG_ESPACE;
3175             }
3176           break;
3177         }
3178       err = re_node_set_insert (dst_nodes, cur_node);
3179       if (BE (err == -1, 0))
3180         return REG_ESPACE;
3181       if (dfa->edests[cur_node].nelem == 0)
3182         break;
3183       if (dfa->edests[cur_node].nelem == 2)
3184         {
3185           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3186                                               dfa->edests[cur_node].elems[1],
3187                                               ex_subexp, type);
3188           if (BE (err != REG_NOERROR, 0))
3189             return err;
3190         }
3191       cur_node = dfa->edests[cur_node].elems[0];
3192     }
3193   return REG_NOERROR;
3194 }
3195
3196
3197 /* For all the back references in the current state, calculate the
3198    destination of the back references by the appropriate entry
3199    in MCTX->BKREF_ENTS.  */
3200
3201 static reg_errcode_t
3202 expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3203                     type)
3204      re_match_context_t *mctx;
3205      int cur_str, subexp_num, type;
3206      re_node_set *cur_nodes;
3207 {
3208   re_dfa_t *const dfa = mctx->dfa;
3209   reg_errcode_t err;
3210   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3211   struct re_backref_cache_entry *ent;
3212
3213   if (cache_idx_start == -1)
3214     return REG_NOERROR;
3215
3216  restart:
3217   ent = mctx->bkref_ents + cache_idx_start;
3218   do
3219     {
3220       int to_idx, next_node;
3221
3222       /* Is this entry ENT is appropriate?  */
3223       if (!re_node_set_contains (cur_nodes, ent->node))
3224         continue; /* No.  */
3225
3226       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3227       /* Calculate the destination of the back reference, and append it
3228          to MCTX->STATE_LOG.  */
3229       if (to_idx == cur_str)
3230         {
3231           /* The backreference did epsilon transit, we must re-check all the
3232              node in the current state.  */
3233           re_node_set new_dests;
3234           reg_errcode_t err2, err3;
3235           next_node = dfa->edests[ent->node].elems[0];
3236           if (re_node_set_contains (cur_nodes, next_node))
3237             continue;
3238           err = re_node_set_init_1 (&new_dests, next_node);
3239           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3240           err3 = re_node_set_merge (cur_nodes, &new_dests);
3241           re_node_set_free (&new_dests);
3242           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3243                   || err3 != REG_NOERROR, 0))
3244             {
3245               err = (err != REG_NOERROR ? err
3246                      : (err2 != REG_NOERROR ? err2 : err3));
3247               return err;
3248             }
3249           /* TODO: It is still inefficient...  */
3250           goto restart;
3251         }
3252       else
3253         {
3254           re_node_set union_set;
3255           next_node = dfa->nexts[ent->node];
3256           if (mctx->state_log[to_idx])
3257             {
3258               int ret;
3259               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3260                                         next_node))
3261                 continue;
3262               err = re_node_set_init_copy (&union_set,
3263                                            &mctx->state_log[to_idx]->nodes);
3264               ret = re_node_set_insert (&union_set, next_node);
3265               if (BE (err != REG_NOERROR || ret < 0, 0))
3266                 {
3267                   re_node_set_free (&union_set);
3268                   err = err != REG_NOERROR ? err : REG_ESPACE;
3269                   return err;
3270                 }
3271             }
3272           else
3273             {
3274               err = re_node_set_init_1 (&union_set, next_node);
3275               if (BE (err != REG_NOERROR, 0))
3276                 return err;
3277             }
3278           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3279           re_node_set_free (&union_set);
3280           if (BE (mctx->state_log[to_idx] == NULL
3281                   && err != REG_NOERROR, 0))
3282             return err;
3283         }
3284     }
3285   while (ent++->more);
3286   return REG_NOERROR;
3287 }
3288
3289 /* Build transition table for the state.
3290    Return 1 if succeeded, otherwise return NULL.  */
3291
3292 static int
3293 build_trtable (dfa, state)
3294     re_dfa_t *dfa;
3295     re_dfastate_t *state;
3296 {
3297   reg_errcode_t err;
3298   int i, j, ch, need_word_trtable = 0;
3299   unsigned int elem, mask;
3300   int dests_node_malloced = 0, dest_states_malloced = 0;
3301   int ndests; /* Number of the destination states from `state'.  */
3302   re_dfastate_t **trtable;
3303   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3304   re_node_set follows, *dests_node;
3305   bitset *dests_ch;
3306   bitset acceptable;
3307
3308   /* We build DFA states which corresponds to the destination nodes
3309      from `state'.  `dests_node[i]' represents the nodes which i-th
3310      destination state contains, and `dests_ch[i]' represents the
3311      characters which i-th destination state accepts.  */
3312 #ifdef _LIBC
3313   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3314     dests_node = (re_node_set *)
3315       alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3316   else
3317 #endif
3318     {
3319       dests_node = (re_node_set *)
3320         malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3321       if (BE (dests_node == NULL, 0))
3322         return 0;
3323       dests_node_malloced = 1;
3324     }
3325   dests_ch = (bitset *) (dests_node + SBC_MAX);
3326
3327   /* Initialize transiton table.  */
3328   state->word_trtable = state->trtable = NULL;
3329
3330   /* At first, group all nodes belonging to `state' into several
3331      destinations.  */
3332   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3333   if (BE (ndests <= 0, 0))
3334     {
3335       if (dests_node_malloced)
3336         free (dests_node);
3337       /* Return 0 in case of an error, 1 otherwise.  */
3338       if (ndests == 0)
3339         {
3340           state->trtable = (re_dfastate_t **)
3341             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3342           return 1;
3343         }
3344       return 0;
3345     }
3346
3347   err = re_node_set_alloc (&follows, ndests + 1);
3348   if (BE (err != REG_NOERROR, 0))
3349     goto out_free;
3350
3351 #ifdef _LIBC
3352   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3353                          + ndests * 3 * sizeof (re_dfastate_t *)))
3354     dest_states = (re_dfastate_t **)
3355       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3356   else
3357 #endif
3358     {
3359       dest_states = (re_dfastate_t **)
3360         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3361       if (BE (dest_states == NULL, 0))
3362         {
3363 out_free:
3364           if (dest_states_malloced)
3365             free (dest_states);
3366           re_node_set_free (&follows);
3367           for (i = 0; i < ndests; ++i)
3368             re_node_set_free (dests_node + i);
3369           if (dests_node_malloced)
3370             free (dests_node);
3371           return 0;
3372         }
3373       dest_states_malloced = 1;
3374     }
3375   dest_states_word = dest_states + ndests;
3376   dest_states_nl = dest_states_word + ndests;
3377   bitset_empty (acceptable);
3378
3379   /* Then build the states for all destinations.  */
3380   for (i = 0; i < ndests; ++i)
3381     {
3382       int next_node;
3383       re_node_set_empty (&follows);
3384       /* Merge the follows of this destination states.  */
3385       for (j = 0; j < dests_node[i].nelem; ++j)
3386         {
3387           next_node = dfa->nexts[dests_node[i].elems[j]];
3388           if (next_node != -1)
3389             {
3390               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3391               if (BE (err != REG_NOERROR, 0))
3392                 goto out_free;
3393             }
3394         }
3395       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3396       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3397         goto out_free;
3398       /* If the new state has context constraint,
3399          build appropriate states for these contexts.  */
3400       if (dest_states[i]->has_constraint)
3401         {
3402           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3403                                                           CONTEXT_WORD);
3404           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3405             goto out_free;
3406
3407           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3408             need_word_trtable = 1;
3409
3410           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3411                                                         CONTEXT_NEWLINE);
3412           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3413             goto out_free;
3414         }
3415       else
3416         {
3417           dest_states_word[i] = dest_states[i];
3418           dest_states_nl[i] = dest_states[i];
3419         }
3420       bitset_merge (acceptable, dests_ch[i]);
3421     }
3422
3423   if (!BE (need_word_trtable, 0))
3424     {
3425       /* We don't care about whether the following character is a word
3426          character, or we are in a single-byte character set so we can
3427          discern by looking at the character code: allocate a
3428          256-entry transition table.  */
3429       trtable = state->trtable =
3430         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3431       if (BE (trtable == NULL, 0))
3432         goto out_free;
3433
3434       /* For all characters ch...:  */
3435       for (i = 0; i < BITSET_UINTS; ++i)
3436         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3437              elem;
3438              mask <<= 1, elem >>= 1, ++ch)
3439           if (BE (elem & 1, 0))
3440             {
3441               /* There must be exactly one destination which accepts
3442                  character ch.  See group_nodes_into_DFAstates.  */
3443               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3444                 ;
3445
3446               /* j-th destination accepts the word character ch.  */
3447               if (dfa->word_char[i] & mask)
3448                 trtable[ch] = dest_states_word[j];
3449               else
3450                 trtable[ch] = dest_states[j];
3451             }
3452     }
3453   else
3454     {
3455       /* We care about whether the following character is a word
3456          character, and we are in a multi-byte character set: discern
3457          by looking at the character code: build two 256-entry
3458          transition tables, one starting at trtable[0] and one
3459          starting at trtable[SBC_MAX].  */
3460       trtable = state->word_trtable =
3461         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3462       if (BE (trtable == NULL, 0))
3463         goto out_free;
3464
3465       /* For all characters ch...:  */
3466       for (i = 0; i < BITSET_UINTS; ++i)
3467         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3468              elem;
3469              mask <<= 1, elem >>= 1, ++ch)
3470           if (BE (elem & 1, 0))
3471             {
3472               /* There must be exactly one destination which accepts
3473                  character ch.  See group_nodes_into_DFAstates.  */
3474               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3475                 ;
3476
3477               /* j-th destination accepts the word character ch.  */
3478               trtable[ch] = dest_states[j];
3479               trtable[ch + SBC_MAX] = dest_states_word[j];
3480             }
3481     }
3482
3483   /* new line */
3484   if (bitset_contain (acceptable, NEWLINE_CHAR))
3485     {
3486       /* The current state accepts newline character.  */
3487       for (j = 0; j < ndests; ++j)
3488         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3489           {
3490             /* k-th destination accepts newline character.  */
3491             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3492             if (need_word_trtable)
3493               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3494             /* There must be only one destination which accepts
3495                newline.  See group_nodes_into_DFAstates.  */
3496             break;
3497           }
3498     }
3499
3500   if (dest_states_malloced)
3501     free (dest_states);
3502
3503   re_node_set_free (&follows);
3504   for (i = 0; i < ndests; ++i)
3505     re_node_set_free (dests_node + i);
3506
3507   if (dests_node_malloced)
3508     free (dests_node);
3509
3510   return 1;
3511 }
3512
3513 /* Group all nodes belonging to STATE into several destinations.
3514    Then for all destinations, set the nodes belonging to the destination
3515    to DESTS_NODE[i] and set the characters accepted by the destination
3516    to DEST_CH[i].  This function return the number of destinations.  */
3517
3518 static int
3519 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3520     re_dfa_t *dfa;
3521     const re_dfastate_t *state;
3522     re_node_set *dests_node;
3523     bitset *dests_ch;
3524 {
3525   reg_errcode_t err;
3526   int result;
3527   int i, j, k;
3528   int ndests; /* Number of the destinations from `state'.  */
3529   bitset accepts; /* Characters a node can accept.  */
3530   const re_node_set *cur_nodes = &state->nodes;
3531   bitset_empty (accepts);
3532   ndests = 0;
3533
3534   /* For all the nodes belonging to `state',  */
3535   for (i = 0; i < cur_nodes->nelem; ++i)
3536     {
3537       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3538       re_token_type_t type = node->type;
3539       unsigned int constraint = node->constraint;
3540
3541       /* Enumerate all single byte character this node can accept.  */
3542       if (type == CHARACTER)
3543         bitset_set (accepts, node->opr.c);
3544       else if (type == SIMPLE_BRACKET)
3545         {
3546           bitset_merge (accepts, node->opr.sbcset);
3547         }
3548       else if (type == OP_PERIOD)
3549         {
3550 #ifdef RE_ENABLE_I18N
3551           if (dfa->mb_cur_max > 1)
3552             bitset_merge (accepts, dfa->sb_char);
3553           else
3554 #endif
3555             bitset_set_all (accepts);
3556           if (!(dfa->syntax & RE_DOT_NEWLINE))
3557             bitset_clear (accepts, '\n');
3558           if (dfa->syntax & RE_DOT_NOT_NULL)
3559             bitset_clear (accepts, '\0');
3560         }
3561 #ifdef RE_ENABLE_I18N
3562       else if (type == OP_UTF8_PERIOD)
3563         {
3564           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3565           if (!(dfa->syntax & RE_DOT_NEWLINE))
3566             bitset_clear (accepts, '\n');
3567           if (dfa->syntax & RE_DOT_NOT_NULL)
3568             bitset_clear (accepts, '\0');
3569         }
3570 #endif
3571       else
3572         continue;
3573
3574       /* Check the `accepts' and sift the characters which are not
3575          match it the context.  */
3576       if (constraint)
3577         {
3578           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3579             {
3580               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3581               bitset_empty (accepts);
3582               if (accepts_newline)
3583                 bitset_set (accepts, NEWLINE_CHAR);
3584               else
3585                 continue;
3586             }
3587           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3588             {
3589               bitset_empty (accepts);
3590               continue;
3591             }
3592
3593           if (constraint & NEXT_WORD_CONSTRAINT)
3594             {
3595               unsigned int any_set = 0;
3596               if (type == CHARACTER && !node->word_char)
3597                 {
3598                   bitset_empty (accepts);
3599                   continue;
3600                 }
3601 #ifdef RE_ENABLE_I18N
3602               if (dfa->mb_cur_max > 1)
3603                 for (j = 0; j < BITSET_UINTS; ++j)
3604                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3605               else
3606 #endif
3607                 for (j = 0; j < BITSET_UINTS; ++j)
3608                   any_set |= (accepts[j] &= dfa->word_char[j]);
3609               if (!any_set)
3610                 continue;
3611             }
3612           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3613             {
3614               unsigned int any_set = 0;
3615               if (type == CHARACTER && node->word_char)
3616                 {
3617                   bitset_empty (accepts);
3618                   continue;
3619                 }
3620 #ifdef RE_ENABLE_I18N
3621               if (dfa->mb_cur_max > 1)
3622                 for (j = 0; j < BITSET_UINTS; ++j)
3623                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3624               else
3625 #endif
3626                 for (j = 0; j < BITSET_UINTS; ++j)
3627                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3628               if (!any_set)
3629                 continue;
3630             }
3631         }
3632
3633       /* Then divide `accepts' into DFA states, or create a new
3634          state.  Above, we make sure that accepts is not empty.  */
3635       for (j = 0; j < ndests; ++j)
3636         {
3637           bitset intersec; /* Intersection sets, see below.  */
3638           bitset remains;
3639           /* Flags, see below.  */
3640           int has_intersec, not_subset, not_consumed;
3641
3642           /* Optimization, skip if this state doesn't accept the character.  */
3643           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3644             continue;
3645
3646           /* Enumerate the intersection set of this state and `accepts'.  */
3647           has_intersec = 0;
3648           for (k = 0; k < BITSET_UINTS; ++k)
3649             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3650           /* And skip if the intersection set is empty.  */
3651           if (!has_intersec)
3652             continue;
3653
3654           /* Then check if this state is a subset of `accepts'.  */
3655           not_subset = not_consumed = 0;
3656           for (k = 0; k < BITSET_UINTS; ++k)
3657             {
3658               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3659               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3660             }
3661
3662           /* If this state isn't a subset of `accepts', create a
3663              new group state, which has the `remains'. */
3664           if (not_subset)
3665             {
3666               bitset_copy (dests_ch[ndests], remains);
3667               bitset_copy (dests_ch[j], intersec);
3668               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3669               if (BE (err != REG_NOERROR, 0))
3670                 goto error_return;
3671               ++ndests;
3672             }
3673
3674           /* Put the position in the current group. */
3675           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3676           if (BE (result < 0, 0))
3677             goto error_return;
3678
3679           /* If all characters are consumed, go to next node. */
3680           if (!not_consumed)
3681             break;
3682         }
3683       /* Some characters remain, create a new group. */
3684       if (j == ndests)
3685         {
3686           bitset_copy (dests_ch[ndests], accepts);
3687           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3688           if (BE (err != REG_NOERROR, 0))
3689             goto error_return;
3690           ++ndests;
3691           bitset_empty (accepts);
3692         }
3693     }
3694   return ndests;
3695  error_return:
3696   for (j = 0; j < ndests; ++j)
3697     re_node_set_free (dests_node + j);
3698   return -1;
3699 }
3700
3701 #ifdef RE_ENABLE_I18N
3702 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3703    Return the number of the bytes the node accepts.
3704    STR_IDX is the current index of the input string.
3705
3706    This function handles the nodes which can accept one character, or
3707    one collating element like '.', '[a-z]', opposite to the other nodes
3708    can only accept one byte.  */
3709
3710 static int
3711 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3712     re_dfa_t *dfa;
3713     int node_idx, str_idx;
3714     const re_string_t *input;
3715 {
3716   const re_token_t *node = dfa->nodes + node_idx;
3717   int char_len, elem_len;
3718   int i;
3719
3720   if (BE (node->type == OP_UTF8_PERIOD, 0))
3721     {
3722       unsigned char c = re_string_byte_at (input, str_idx), d;
3723       if (BE (c < 0xc2, 1))
3724         return 0;
3725
3726       if (str_idx + 2 > input->len)
3727         return 0;
3728
3729       d = re_string_byte_at (input, str_idx + 1);
3730       if (c < 0xe0)
3731         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3732       else if (c < 0xf0)
3733         {
3734           char_len = 3;
3735           if (c == 0xe0 && d < 0xa0)
3736             return 0;
3737         }
3738       else if (c < 0xf8)
3739         {
3740           char_len = 4;
3741           if (c == 0xf0 && d < 0x90)
3742             return 0;
3743         }
3744       else if (c < 0xfc)
3745         {
3746           char_len = 5;
3747           if (c == 0xf8 && d < 0x88)
3748             return 0;
3749         }
3750       else if (c < 0xfe)
3751         {
3752           char_len = 6;
3753           if (c == 0xfc && d < 0x84)
3754             return 0;
3755         }
3756       else
3757         return 0;
3758
3759       if (str_idx + char_len > input->len)
3760         return 0;
3761
3762       for (i = 1; i < char_len; ++i)
3763         {
3764           d = re_string_byte_at (input, str_idx + i);
3765           if (d < 0x80 || d > 0xbf)
3766             return 0;
3767         }
3768       return char_len;
3769     }
3770
3771   char_len = re_string_char_size_at (input, str_idx);
3772   if (node->type == OP_PERIOD)
3773     {
3774       if (char_len <= 1)
3775         return 0;
3776       /* FIXME: I don't think this if is needed, as both '\n'
3777          and '\0' are char_len == 1.  */
3778       /* '.' accepts any one character except the following two cases.  */
3779       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3780            re_string_byte_at (input, str_idx) == '\n') ||
3781           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3782            re_string_byte_at (input, str_idx) == '\0'))
3783         return 0;
3784       return char_len;
3785     }
3786
3787   elem_len = re_string_elem_size_at (input, str_idx);
3788   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3789     return 0;
3790
3791   if (node->type == COMPLEX_BRACKET)
3792     {
3793       const re_charset_t *cset = node->opr.mbcset;
3794 # ifdef _LIBC
3795       const unsigned char *pin
3796         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3797       int j;
3798       uint32_t nrules;
3799 # endif /* _LIBC */
3800       int match_len = 0;
3801       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3802                     ? re_string_wchar_at (input, str_idx) : 0);
3803
3804       /* match with multibyte character?  */
3805       for (i = 0; i < cset->nmbchars; ++i)
3806         if (wc == cset->mbchars[i])
3807           {
3808             match_len = char_len;
3809             goto check_node_accept_bytes_match;
3810           }
3811       /* match with character_class?  */
3812       for (i = 0; i < cset->nchar_classes; ++i)
3813         {
3814           wctype_t wt = cset->char_classes[i];
3815           if (__iswctype (wc, wt))
3816             {
3817               match_len = char_len;
3818               goto check_node_accept_bytes_match;
3819             }
3820         }
3821
3822 # ifdef _LIBC
3823       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3824       if (nrules != 0)
3825         {
3826           unsigned int in_collseq = 0;
3827           const int32_t *table, *indirect;
3828           const unsigned char *weights, *extra;
3829           const char *collseqwc;
3830           int32_t idx;
3831           /* This #include defines a local function!  */
3832 #  include <locale/weight.h>
3833
3834           /* match with collating_symbol?  */
3835           if (cset->ncoll_syms)
3836             extra = (const unsigned char *)
3837               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3838           for (i = 0; i < cset->ncoll_syms; ++i)
3839             {
3840               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3841               /* Compare the length of input collating element and
3842                  the length of current collating element.  */
3843               if (*coll_sym != elem_len)
3844                 continue;
3845               /* Compare each bytes.  */
3846               for (j = 0; j < *coll_sym; j++)
3847                 if (pin[j] != coll_sym[1 + j])
3848                   break;
3849               if (j == *coll_sym)
3850                 {
3851                   /* Match if every bytes is equal.  */
3852                   match_len = j;
3853                   goto check_node_accept_bytes_match;
3854                 }
3855             }
3856
3857           if (cset->nranges)
3858             {
3859               if (elem_len <= char_len)
3860                 {
3861                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3862                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3863                 }
3864               else
3865                 in_collseq = find_collation_sequence_value (pin, elem_len);
3866             }
3867           /* match with range expression?  */
3868           for (i = 0; i < cset->nranges; ++i)
3869             if (cset->range_starts[i] <= in_collseq
3870                 && in_collseq <= cset->range_ends[i])
3871               {
3872                 match_len = elem_len;
3873                 goto check_node_accept_bytes_match;
3874               }
3875
3876           /* match with equivalence_class?  */
3877           if (cset->nequiv_classes)
3878             {
3879               const unsigned char *cp = pin;
3880               table = (const int32_t *)
3881                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3882               weights = (const unsigned char *)
3883                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3884               extra = (const unsigned char *)
3885                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3886               indirect = (const int32_t *)
3887                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3888               idx = findidx (&cp);
3889               if (idx > 0)
3890                 for (i = 0; i < cset->nequiv_classes; ++i)
3891                   {
3892                     int32_t equiv_class_idx = cset->equiv_classes[i];
3893                     size_t weight_len = weights[idx];
3894                     if (weight_len == weights[equiv_class_idx])
3895                       {
3896                         int cnt = 0;
3897                         while (cnt <= weight_len
3898                                && (weights[equiv_class_idx + 1 + cnt]
3899                                    == weights[idx + 1 + cnt]))
3900                           ++cnt;
3901                         if (cnt > weight_len)
3902                           {
3903                             match_len = elem_len;
3904                             goto check_node_accept_bytes_match;
3905                           }
3906                       }
3907                   }
3908             }
3909         }
3910       else
3911 # endif /* _LIBC */
3912         {
3913           /* match with range expression?  */
3914 #if __GNUC__ >= 2
3915           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3916 #else
3917           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3918           cmp_buf[2] = wc;
3919 #endif
3920           for (i = 0; i < cset->nranges; ++i)
3921             {
3922               cmp_buf[0] = cset->range_starts[i];
3923               cmp_buf[4] = cset->range_ends[i];
3924               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3925                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3926                 {
3927                   match_len = char_len;
3928                   goto check_node_accept_bytes_match;
3929                 }
3930             }
3931         }
3932     check_node_accept_bytes_match:
3933       if (!cset->non_match)
3934         return match_len;
3935       else
3936         {
3937           if (match_len > 0)
3938             return 0;
3939           else
3940             return (elem_len > char_len) ? elem_len : char_len;
3941         }
3942     }
3943   return 0;
3944 }
3945
3946 # ifdef _LIBC
3947 static unsigned int
3948 find_collation_sequence_value (mbs, mbs_len)
3949     const unsigned char *mbs;
3950     size_t mbs_len;
3951 {
3952   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3953   if (nrules == 0)
3954     {
3955       if (mbs_len == 1)
3956         {
3957           /* No valid character.  Match it as a single byte character.  */
3958           const unsigned char *collseq = (const unsigned char *)
3959             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3960           return collseq[mbs[0]];
3961         }
3962       return UINT_MAX;
3963     }
3964   else
3965     {
3966       int32_t idx;
3967       const unsigned char *extra = (const unsigned char *)
3968         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3969       int32_t extrasize = (const unsigned char *)
3970         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3971
3972       for (idx = 0; idx < extrasize;)
3973         {
3974           int mbs_cnt, found = 0;
3975           int32_t elem_mbs_len;
3976           /* Skip the name of collating element name.  */
3977           idx = idx + extra[idx] + 1;
3978           elem_mbs_len = extra[idx++];
3979           if (mbs_len == elem_mbs_len)
3980             {
3981               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3982                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3983                   break;
3984               if (mbs_cnt == elem_mbs_len)
3985                 /* Found the entry.  */
3986                 found = 1;
3987             }
3988           /* Skip the byte sequence of the collating element.  */
3989           idx += elem_mbs_len;
3990           /* Adjust for the alignment.  */
3991           idx = (idx + 3) & ~3;
3992           /* Skip the collation sequence value.  */
3993           idx += sizeof (uint32_t);
3994           /* Skip the wide char sequence of the collating element.  */
3995           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3996           /* If we found the entry, return the sequence value.  */
3997           if (found)
3998             return *(uint32_t *) (extra + idx);
3999           /* Skip the collation sequence value.  */
4000           idx += sizeof (uint32_t);
4001         }
4002       return UINT_MAX;
4003     }
4004 }
4005 # endif /* _LIBC */
4006 #endif /* RE_ENABLE_I18N */
4007
4008 /* Check whether the node accepts the byte which is IDX-th
4009    byte of the INPUT.  */
4010
4011 static int
4012 check_node_accept (mctx, node, idx)
4013     const re_match_context_t *mctx;
4014     const re_token_t *node;
4015     int idx;
4016 {
4017   unsigned char ch;
4018   ch = re_string_byte_at (&mctx->input, idx);
4019   switch (node->type)
4020     {
4021     case CHARACTER:
4022       if (node->opr.c != ch)
4023         return 0;
4024       break;
4025
4026     case SIMPLE_BRACKET:
4027       if (!bitset_contain (node->opr.sbcset, ch))
4028         return 0;
4029       break;
4030
4031 #ifdef RE_ENABLE_I18N
4032     case OP_UTF8_PERIOD:
4033       if (ch >= 0x80)
4034         return 0;
4035       /* FALLTHROUGH */
4036 #endif
4037     case OP_PERIOD:
4038       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4039           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4040         return 0;
4041       break;
4042
4043     default:
4044       return 0;
4045     }
4046
4047   if (node->constraint)
4048     {
4049       /* The node has constraints.  Check whether the current context
4050          satisfies the constraints.  */
4051       unsigned int context = re_string_context_at (&mctx->input, idx,
4052                                                    mctx->eflags);
4053       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4054         return 0;
4055     }
4056
4057   return 1;
4058 }
4059
4060 /* Extend the buffers, if the buffers have run out.  */
4061
4062 static reg_errcode_t
4063 extend_buffers (mctx)
4064      re_match_context_t *mctx;
4065 {
4066   reg_errcode_t ret;
4067   re_string_t *pstr = &mctx->input;
4068
4069   /* Double the lengthes of the buffers.  */
4070   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4071   if (BE (ret != REG_NOERROR, 0))
4072     return ret;
4073
4074   if (mctx->state_log != NULL)
4075     {
4076       /* And double the length of state_log.  */
4077       /* XXX We have no indication of the size of this buffer.  If this
4078          allocation fail we have no indication that the state_log array
4079          does not have the right size.  */
4080       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4081                                               pstr->bufs_len + 1);
4082       if (BE (new_array == NULL, 0))
4083         return REG_ESPACE;
4084       mctx->state_log = new_array;
4085     }
4086
4087   /* Then reconstruct the buffers.  */
4088   if (pstr->icase)
4089     {
4090 #ifdef RE_ENABLE_I18N
4091       if (pstr->mb_cur_max > 1)
4092         {
4093           ret = build_wcs_upper_buffer (pstr);
4094           if (BE (ret != REG_NOERROR, 0))
4095             return ret;
4096         }
4097       else
4098 #endif /* RE_ENABLE_I18N  */
4099         build_upper_buffer (pstr);
4100     }
4101   else
4102     {
4103 #ifdef RE_ENABLE_I18N
4104       if (pstr->mb_cur_max > 1)
4105         build_wcs_buffer (pstr);
4106       else
4107 #endif /* RE_ENABLE_I18N  */
4108         {
4109           if (pstr->trans != NULL)
4110             re_string_translate_buffer (pstr);
4111         }
4112     }
4113   return REG_NOERROR;
4114 }
4115
4116 \f
4117 /* Functions for matching context.  */
4118
4119 /* Initialize MCTX.  */
4120
4121 static reg_errcode_t
4122 match_ctx_init (mctx, eflags, n)
4123     re_match_context_t *mctx;
4124     int eflags, n;
4125 {
4126   mctx->eflags = eflags;
4127   mctx->match_last = -1;
4128   if (n > 0)
4129     {
4130       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4131       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4132       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4133         return REG_ESPACE;
4134     }
4135   /* Already zero-ed by the caller.
4136      else
4137        mctx->bkref_ents = NULL;
4138      mctx->nbkref_ents = 0;
4139      mctx->nsub_tops = 0;  */
4140   mctx->abkref_ents = n;
4141   mctx->max_mb_elem_len = 1;
4142   mctx->asub_tops = n;
4143   return REG_NOERROR;
4144 }
4145
4146 /* Clean the entries which depend on the current input in MCTX.
4147    This function must be invoked when the matcher changes the start index
4148    of the input, or changes the input string.  */
4149
4150 static void
4151 match_ctx_clean (mctx)
4152     re_match_context_t *mctx;
4153 {
4154   int st_idx;
4155   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4156     {
4157       int sl_idx;
4158       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4159       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4160         {
4161           re_sub_match_last_t *last = top->lasts[sl_idx];
4162           re_free (last->path.array);
4163           re_free (last);
4164         }
4165       re_free (top->lasts);
4166       if (top->path)
4167         {
4168           re_free (top->path->array);
4169           re_free (top->path);
4170         }
4171       free (top);
4172     }
4173
4174   mctx->nsub_tops = 0;
4175   mctx->nbkref_ents = 0;
4176 }
4177
4178 /* Free all the memory associated with MCTX.  */
4179
4180 static void
4181 match_ctx_free (mctx)
4182     re_match_context_t *mctx;
4183 {
4184   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4185   match_ctx_clean (mctx);
4186   re_free (mctx->sub_tops);
4187   re_free (mctx->bkref_ents);
4188 }
4189
4190 /* Add a new backreference entry to MCTX.
4191    Note that we assume that caller never call this function with duplicate
4192    entry, and call with STR_IDX which isn't smaller than any existing entry.
4193 */
4194
4195 static reg_errcode_t
4196 match_ctx_add_entry (mctx, node, str_idx, from, to)
4197      re_match_context_t *mctx;
4198      int node, str_idx, from, to;
4199 {
4200   if (mctx->nbkref_ents >= mctx->abkref_ents)
4201     {
4202       struct re_backref_cache_entry* new_entry;
4203       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4204                               mctx->abkref_ents * 2);
4205       if (BE (new_entry == NULL, 0))
4206         {
4207           re_free (mctx->bkref_ents);
4208           return REG_ESPACE;
4209         }
4210       mctx->bkref_ents = new_entry;
4211       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4212               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4213       mctx->abkref_ents *= 2;
4214     }
4215   if (mctx->nbkref_ents > 0
4216       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4217     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4218
4219   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4220   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4221   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4222   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4223
4224   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4225      If bit N is clear, means that this entry won't epsilon-transition to
4226      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4227      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4228      such node.
4229
4230      A backreference does not epsilon-transition unless it is empty, so set
4231      to all zeros if FROM != TO.  */
4232   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4233     = (from == to ? ~0 : 0);
4234
4235   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4236   if (mctx->max_mb_elem_len < to - from)
4237     mctx->max_mb_elem_len = to - from;
4238   return REG_NOERROR;
4239 }
4240
4241 /* Search for the first entry which has the same str_idx, or -1 if none is
4242    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4243
4244 static int
4245 search_cur_bkref_entry (mctx, str_idx)
4246      re_match_context_t *mctx;
4247      int str_idx;
4248 {
4249   int left, right, mid, last;
4250   last = right = mctx->nbkref_ents;
4251   for (left = 0; left < right;)
4252     {
4253       mid = (left + right) / 2;
4254       if (mctx->bkref_ents[mid].str_idx < str_idx)
4255         left = mid + 1;
4256       else
4257         right = mid;
4258     }
4259   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4260     return left;
4261   else
4262     return -1;
4263 }
4264
4265 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4266    at STR_IDX.  */
4267
4268 static reg_errcode_t
4269 match_ctx_add_subtop (mctx, node, str_idx)
4270      re_match_context_t *mctx;
4271      int node, str_idx;
4272 {
4273 #ifdef DEBUG
4274   assert (mctx->sub_tops != NULL);
4275   assert (mctx->asub_tops > 0);
4276 #endif
4277   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4278     {
4279       int new_asub_tops = mctx->asub_tops * 2;
4280       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4281                                                    re_sub_match_top_t *,
4282                                                    new_asub_tops);
4283       if (BE (new_array == NULL, 0))
4284         return REG_ESPACE;
4285       mctx->sub_tops = new_array;
4286       mctx->asub_tops = new_asub_tops;
4287     }
4288   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4289   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4290     return REG_ESPACE;
4291   mctx->sub_tops[mctx->nsub_tops]->node = node;
4292   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4293   return REG_NOERROR;
4294 }
4295
4296 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4297    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4298
4299 static re_sub_match_last_t *
4300 match_ctx_add_sublast (subtop, node, str_idx)
4301      re_sub_match_top_t *subtop;
4302      int node, str_idx;
4303 {
4304   re_sub_match_last_t *new_entry;
4305   if (BE (subtop->nlasts == subtop->alasts, 0))
4306     {
4307       int new_alasts = 2 * subtop->alasts + 1;
4308       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4309                                                     re_sub_match_last_t *,
4310                                                     new_alasts);
4311       if (BE (new_array == NULL, 0))
4312         return NULL;
4313       subtop->lasts = new_array;
4314       subtop->alasts = new_alasts;
4315     }
4316   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4317   if (BE (new_entry != NULL, 1))
4318     {
4319       subtop->lasts[subtop->nlasts] = new_entry;
4320       new_entry->node = node;
4321       new_entry->str_idx = str_idx;
4322       ++subtop->nlasts;
4323     }
4324   return new_entry;
4325 }
4326
4327 static void
4328 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4329     re_sift_context_t *sctx;
4330     re_dfastate_t **sifted_sts, **limited_sts;
4331     int last_node, last_str_idx;
4332 {
4333   sctx->sifted_states = sifted_sts;
4334   sctx->limited_states = limited_sts;
4335   sctx->last_node = last_node;
4336   sctx->last_str_idx = last_str_idx;
4337   re_node_set_init_empty (&sctx->limits);
4338 }