lib/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      Idx n) internal_function;
  22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  23 static void match_ctx_free (re_match_context_t *cache) internal_function;
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
  25                                           Idx str_idx, Idx from, Idx to)
  26      internal_function;
  27 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
  28      internal_function;
  29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
  30                                            Idx str_idx) internal_function;
  31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  32                                                     Idx node, Idx str_idx)
  33      internal_function;
  34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  35                            re_dfastate_t **limited_sts, Idx last_node,
  36                            Idx last_str_idx)
  37      internal_function;
  38 static reg_errcode_t re_search_internal (const regex_t *preg,
  39                                          const char *string, Idx length,
  40                                          Idx start, regoff_t range, Idx stop,
  41                                          size_t nmatch, regmatch_t pmatch[],
  42                                          int eflags) internal_function;
  43 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
  44                                   const char *string1, Idx length1,
  45                                   const char *string2, Idx length2,
  46                                   Idx start, regoff_t range,
  47                                   struct re_registers *regs,
  48                                   Idx stop, int ret_len) internal_function;
  49 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
  50                                 const char *string, Idx length, Idx start,
  51                                 regoff_t range, Idx stop,
  52                                 struct re_registers *regs,
  53                                 int ret_len) internal_function;
  54 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  55                               Idx nregs, int regs_allocated) internal_function;
  56 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  57      internal_function;
  58 static Idx check_matching (re_match_context_t *mctx, int fl_longest_match,
  59                            Idx *p_match_first)
  60      internal_function;
  61 static Idx check_halt_state_context (const re_match_context_t *mctx,
  62                                      const re_dfastate_t *state, Idx idx)
  63      internal_function;
  64 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  65                          regmatch_t *prev_idx_match, Idx cur_node,
  66                          Idx cur_idx, Idx nmatch) internal_function;
  67 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  68                                       Idx str_idx, Idx dest_node, Idx nregs,
  69                                       regmatch_t *regs,
  70                                       re_node_set *eps_via_nodes) internal_function;
  71 static reg_errcode_t set_regs (const regex_t *preg,
  72                                const re_match_context_t *mctx,
  73                                size_t nmatch, regmatch_t *pmatch,
  74                                int fl_backtrack) internal_function;
  75 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  76
  77 #ifdef RE_ENABLE_I18N
  78 static int sift_states_iter_mb (const re_match_context_t *mctx,
  79                                 re_sift_context_t *sctx,
  80                                 Idx node_idx, Idx str_idx, Idx max_str_idx) internal_function;
  81 #endif /* RE_ENABLE_I18N */
  82 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  83                                            re_sift_context_t *sctx) internal_function;
  84 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  85                                           re_sift_context_t *sctx, Idx str_idx,
  86                                           re_node_set *cur_dest) internal_function;
  87 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  88                                               re_sift_context_t *sctx,
  89                                               Idx str_idx,
  90                                               re_node_set *dest_nodes) internal_function;
  91 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
  92                                             re_node_set *dest_nodes,
  93                                             const re_node_set *candidates) internal_function;
  94 static int check_dst_limits (const re_match_context_t *mctx,
  95                              const re_node_set *limits,
  96                              Idx dst_node, Idx dst_idx, Idx src_node,
  97                              Idx src_idx) internal_function;
  98 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
  99                                         int boundaries, Idx subexp_idx,
 100                                         Idx from_node, Idx bkref_idx) internal_function;
 101 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 102                                       Idx limit, Idx subexp_idx,
 103                                       Idx node, Idx str_idx,
 104                                       Idx bkref_idx) internal_function;
 105 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 106                                           re_node_set *dest_nodes,
 107                                           const re_node_set *candidates,
 108                                           re_node_set *limits,
 109                                           struct re_backref_cache_entry *bkref_ents,
 110                                           Idx str_idx) internal_function;
 111 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 112                                         re_sift_context_t *sctx,
 113                                         Idx str_idx, const re_node_set *candidates) internal_function;
 114 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 115                                         re_dfastate_t **src, Idx num) internal_function;
 116 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 117                                          re_match_context_t *mctx) internal_function;
 118 static re_dfastate_t *transit_state (reg_errcode_t *err,
 119                                      re_match_context_t *mctx,
 120                                      re_dfastate_t *state) internal_function;
 121 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 122                                             re_match_context_t *mctx,
 123                                             re_dfastate_t *next_state) internal_function;
 124 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 125                                                 re_node_set *cur_nodes,
 126                                                 Idx str_idx) internal_function;
 127 #if 0
 128 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 129                                         re_match_context_t *mctx,
 130                                         re_dfastate_t *pstate) internal_function;
 131 #endif
 132 #ifdef RE_ENABLE_I18N
 133 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 134                                        re_dfastate_t *pstate) internal_function;
 135 #endif /* RE_ENABLE_I18N */
 136 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 137                                           const re_node_set *nodes) internal_function;
 138 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 139                                  Idx bkref_node, Idx bkref_str_idx) internal_function;
 140 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 141                                      const re_sub_match_top_t *sub_top,
 142                                      re_sub_match_last_t *sub_last,
 143                                      Idx bkref_node, Idx bkref_str) internal_function;
 144 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 145                              Idx subexp_idx, int type) internal_function;
 146 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 147                                     state_array_t *path, Idx top_node,
 148                                     Idx top_str, Idx last_node, Idx last_str,
 149                                     int type) internal_function;
 150 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 151                                                    Idx str_idx,
 152                                                    re_node_set *cur_nodes,
 153                                                    re_node_set *next_nodes) internal_function;
 154 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 155                                                re_node_set *cur_nodes,
 156                                                Idx ex_subexp, int type) internal_function;
 157 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 158                                                    re_node_set *dst_nodes,
 159                                                    Idx target, Idx ex_subexp,
 160                                                    int type) internal_function;
 161 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 162                                          re_node_set *cur_nodes, Idx cur_str,
 163                                          Idx subexp_num, int type) internal_function;
 164 static int build_trtable (re_dfa_t *dfa,
 165                           re_dfastate_t *state) internal_function;
 166 #ifdef RE_ENABLE_I18N
 167 static int check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
 168                                     const re_string_t *input, Idx idx) internal_function;
 169 # ifdef _LIBC
 170 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 171                                                    size_t name_len) internal_function;
 172 # endif /* _LIBC */
 173 #endif /* RE_ENABLE_I18N */
 174 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
 175                                        const re_dfastate_t *state,
 176                                        re_node_set *states_node,
 177                                        bitset *states_ch) internal_function;
 178 static int check_node_accept (const re_match_context_t *mctx,
 179                               const re_token_t *node, Idx idx) internal_function;
 180 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 181 \f
 182 /* Entry point for POSIX code.  */
 183
 184 /* regexec searches for a given pattern, specified by PREG, in the
 185    string STRING.
 186
 187    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 188    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 189    least NMATCH elements, and we set them to the offsets of the
 190    corresponding matched substrings.
 191
 192    EFLAGS specifies `execution flags' which affect matching: if
 193    REG_NOTBOL is set, then ^ does not match at the beginning of the
 194    string; if REG_NOTEOL is set, then $ does not match at the end.
 195
 196    We return 0 if we find a match and REG_NOMATCH if not.  */
 197
 198 int
 199 regexec (const regex_t *__restrict preg, const char *__restrict string,
 200          size_t nmatch, regmatch_t pmatch[], int eflags)
 201 {
 202   reg_errcode_t err;
 203   Idx start, length;
 204 #ifdef _LIBC
 205   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 206 #endif
 207
 208   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 209     return REG_BADPAT;
 210
 211   if (eflags & REG_STARTEND)
 212     {
 213       start = pmatch[0].rm_so;
 214       length = pmatch[0].rm_eo;
 215     }
 216   else
 217     {
 218       start = 0;
 219       length = strlen (string);
 220     }
 221
 222   __libc_lock_lock (dfa->lock);
 223   if (preg->re_no_sub)
 224     err = re_search_internal (preg, string, length, start, length - start,
 225                               length, 0, NULL, eflags);
 226   else
 227     err = re_search_internal (preg, string, length, start, length - start,
 228                               length, nmatch, pmatch, eflags);
 229   __libc_lock_unlock (dfa->lock);
 230   return err != REG_NOERROR;
 231 }
 232
 233 #ifdef _LIBC
 234 # include <shlib-compat.h>
 235 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 236
 237 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 238 __typeof__ (__regexec) __compat_regexec;
 239
 240 int
 241 attribute_compat_text_section
 242 __compat_regexec (const regex_t *__restrict preg,
 243                   const char *__restrict string, size_t nmatch,
 244                   regmatch_t pmatch[], int eflags)
 245 {
 246   return regexec (preg, string, nmatch, pmatch,
 247                   eflags & (REG_NOTBOL | REG_NOTEOL));
 248 }
 249 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 250 # endif
 251 #endif
 252
 253 /* Entry points for GNU code.  */
 254
 255 /* re_match, re_search, re_match_2, re_search_2
 256
 257    The former two functions operate on STRING with length LENGTH,
 258    while the later two operate on concatenation of STRING1 and STRING2
 259    with lengths LENGTH1 and LENGTH2, respectively.
 260
 261    re_match() matches the compiled pattern in BUFP against the string,
 262    starting at index START.
 263
 264    re_search() first tries matching at index START, then it tries to match
 265    starting from index START + 1, and so on.  The last start position tried
 266    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 267    way as re_match().)
 268
 269    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 270    the first STOP characters of the concatenation of the strings should be
 271    concerned.
 272
 273    If REGS is not NULL, and BUFP->re_no_sub is not set, the offsets of the match
 274    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 275    computed relative to the concatenation, not relative to the individual
 276    strings.)
 277
 278    On success, re_match* functions return the length of the match, re_search*
 279    return the position of the start of the match.  Return value -1 means no
 280    match was found and -2 indicates an internal error.  */
 281
 282 regoff_t
 283 re_match (struct re_pattern_buffer *bufp, const char *string,
 284           Idx length, Idx start, struct re_registers *regs)
 285 {
 286   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 287 }
 288 #ifdef _LIBC
 289 weak_alias (__re_match, re_match)
 290 #endif
 291
 292 regoff_t
 293 re_search (struct re_pattern_buffer *bufp, const char *string,
 294            Idx length, Idx start, regoff_t range, struct re_registers *regs)
 295 {
 296   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 297 }
 298 #ifdef _LIBC
 299 weak_alias (__re_search, re_search)
 300 #endif
 301
 302 regoff_t
 303 re_match_2 (struct re_pattern_buffer *bufp,
 304             const char *string1, Idx length1,
 305             const char *string2, Idx length2,
 306             Idx start, struct re_registers *regs, Idx stop)
 307 {
 308   return re_search_2_stub (bufp, string1, length1, string2, length2,
 309                            start, 0, regs, stop, 1);
 310 }
 311 #ifdef _LIBC
 312 weak_alias (__re_match_2, re_match_2)
 313 #endif
 314
 315 regoff_t
 316 re_search_2 (struct re_pattern_buffer *bufp,
 317              const char *string1, Idx length1,
 318              const char *string2, Idx length2,
 319              Idx start, regoff_t range, struct re_registers *regs, Idx stop)
 320 {
 321   return re_search_2_stub (bufp, string1, length1, string2, length2,
 322                            start, range, regs, stop, 0);
 323 }
 324 #ifdef _LIBC
 325 weak_alias (__re_search_2, re_search_2)
 326 #endif
 327
 328 static regoff_t
 329 internal_function
 330 re_search_2_stub (struct re_pattern_buffer *bufp,
 331                   const char *string1, Idx length1,
 332                   const char *string2, Idx length2,
 333                   Idx start, regoff_t range, struct re_registers *regs,
 334                   Idx stop, int ret_len)
 335 {
 336   const char *str;
 337   regoff_t rval;
 338   Idx len = length1 + length2;
 339   int free_str = 0;
 340
 341   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 342     return -2;
 343
 344   /* Concatenate the strings.  */
 345   if (length2 > 0)
 346     if (length1 > 0)
 347       {
 348         char *s = re_malloc (char, len);
 349
 350         if (BE (s == NULL, 0))
 351           return -2;
 352         memcpy (s, string1, length1);
 353         memcpy (s + length1, string2, length2);
 354         str = s;
 355         free_str = 1;
 356       }
 357     else
 358       str = string2;
 359   else
 360     str = string1;
 361
 362   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 363                          ret_len);
 364   if (free_str)
 365     re_free ((char *) str);
 366   return rval;
 367 }
 368
 369 /* The parameters have the same meaning as those of re_search.
 370    Additional parameters:
 371    If RET_LEN is nonzero the length of the match is returned (re_match style);
 372    otherwise the position of the match is returned.  */
 373
 374 static regoff_t
 375 internal_function
 376 re_search_stub (struct re_pattern_buffer *bufp,
 377                 const char *string, Idx length,
 378                 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
 379                 int ret_len)
 380 {
 381   reg_errcode_t result;
 382   regmatch_t *pmatch;
 383   Idx nregs;
 384   regoff_t rval;
 385   int eflags = 0;
 386 #ifdef _LIBC
 387   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 388 #endif
 389
 390   /* Check for out-of-range.  */
 391   if (BE (start < 0 || start > length, 0))
 392     return -1;
 393   if (sizeof start < sizeof range)
 394     {
 395       regoff_t length_offset = length;
 396       regoff_t start_offset = start;
 397       if (BE (length_offset - start_offset < range, 0))
 398         range = length_offset - start_offset;
 399       else if (BE (range < - start_offset, 0))
 400         range = -start_offset;
 401     }
 402   else
 403     {
 404       if (BE (start + range > length, 0))
 405         range = length - start;
 406       else if (BE (start + range < 0, 0))
 407         range = -start;
 408     }
 409
 410   __libc_lock_lock (dfa->lock);
 411
 412   eflags |= (bufp->re_not_bol) ? REG_NOTBOL : 0;
 413   eflags |= (bufp->re_not_eol) ? REG_NOTEOL : 0;
 414
 415   /* Compile fastmap if we haven't yet.  */
 416   if (range > 0 && bufp->re_fastmap != NULL && !bufp->re_fastmap_accurate)
 417     re_compile_fastmap (bufp);
 418
 419   if (BE (bufp->re_no_sub, 0))
 420     regs = NULL;
 421
 422   /* We need at least 1 register.  */
 423   if (regs == NULL)
 424     nregs = 1;
 425   else if (BE (bufp->re_regs_allocated == REG_FIXED
 426                && regs->rm_num_regs < bufp->re_nsub + 1, 0))
 427     {
 428       nregs = regs->rm_num_regs;
 429       if (BE (nregs < 1, 0))
 430         {
 431           /* Nothing can be copied to regs.  */
 432           regs = NULL;
 433           nregs = 1;
 434         }
 435     }
 436   else
 437     nregs = bufp->re_nsub + 1;
 438   pmatch = re_malloc (regmatch_t, nregs);
 439   if (BE (pmatch == NULL, 0))
 440     {
 441       rval = -2;
 442       goto out;
 443     }
 444
 445   result = re_search_internal (bufp, string, length, start, range, stop,
 446                                nregs, pmatch, eflags);
 447
 448   rval = 0;
 449
 450   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 451   if (result != REG_NOERROR)
 452     rval = -1;
 453   else if (regs != NULL)
 454     {
 455       /* If caller wants register contents data back, copy them.  */
 456       bufp->re_regs_allocated = re_copy_regs (regs, pmatch, nregs,
 457                                               bufp->re_regs_allocated);
 458       if (BE (bufp->re_regs_allocated == REG_UNALLOCATED, 0))
 459         rval = -2;
 460     }
 461
 462   if (BE (rval == 0, 1))
 463     {
 464       if (ret_len)
 465         {
 466           assert (pmatch[0].rm_so == start);
 467           rval = pmatch[0].rm_eo - start;
 468         }
 469       else
 470         rval = pmatch[0].rm_so;
 471     }
 472   re_free (pmatch);
 473  out:
 474   __libc_lock_unlock (dfa->lock);
 475   return rval;
 476 }
 477
 478 static unsigned
 479 internal_function
 480 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
 481               int regs_allocated)
 482 {
 483   int rval = REG_REALLOCATE;
 484   Idx i;
 485   Idx need_regs = nregs + 1;
 486   /* We need one extra element beyond `rm_num_regs' for the `-1' marker GNU code
 487      uses.  */
 488
 489   /* Have the register data arrays been allocated?  */
 490   if (regs_allocated == REG_UNALLOCATED)
 491     { /* No.  So allocate them with malloc.  */
 492       regs->rm_start = re_malloc (regoff_t, need_regs);
 493       regs->rm_end = re_malloc (regoff_t, need_regs);
 494       if (BE (regs->rm_start == NULL, 0) || BE (regs->rm_end == NULL, 0))
 495         return REG_UNALLOCATED;
 496       regs->rm_num_regs = need_regs;
 497     }
 498   else if (regs_allocated == REG_REALLOCATE)
 499     { /* Yes.  If we need more elements than were already
 500          allocated, reallocate them.  If we need fewer, just
 501          leave it alone.  */
 502       if (BE (need_regs > regs->rm_num_regs, 0))
 503         {
 504           regoff_t *new_start =
 505             re_realloc (regs->rm_start, regoff_t, need_regs);
 506           regoff_t *new_end = re_realloc (regs->rm_end, regoff_t, need_regs);
 507           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 508             return REG_UNALLOCATED;
 509           regs->rm_start = new_start;
 510           regs->rm_end = new_end;
 511           regs->rm_num_regs = need_regs;
 512         }
 513     }
 514   else
 515     {
 516       assert (regs_allocated == REG_FIXED);
 517       /* This function may not be called with REG_FIXED and nregs too big.  */
 518       assert (regs->rm_num_regs >= nregs);
 519       rval = REG_FIXED;
 520     }
 521
 522   /* Copy the regs.  */
 523   for (i = 0; i < nregs; ++i)
 524     {
 525       regs->rm_start[i] = pmatch[i].rm_so;
 526       regs->rm_end[i] = pmatch[i].rm_eo;
 527     }
 528   for ( ; i < regs->rm_num_regs; ++i)
 529     regs->rm_start[i] = regs->rm_end[i] = -1;
 530
 531   return rval;
 532 }
 533
 534 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 535    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 536    this memory for recording register information.  STARTS and ENDS
 537    must be allocated using the malloc library routine, and must each
 538    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 539
 540    If NUM_REGS == 0, then subsequent matches should allocate their own
 541    register data.
 542
 543    Unless this function is called, the first search or match using
 544    PATTERN_BUFFER will allocate its own register data, without
 545    freeing the old data.  */
 546
 547 void
 548 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
 549                   __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
 550 {
 551   if (num_regs)
 552     {
 553       bufp->re_regs_allocated = REG_REALLOCATE;
 554       regs->rm_num_regs = num_regs;
 555       regs->rm_start = starts;
 556       regs->rm_end = ends;
 557     }
 558   else
 559     {
 560       bufp->re_regs_allocated = REG_UNALLOCATED;
 561       regs->rm_num_regs = 0;
 562       regs->rm_start = regs->rm_end = NULL;
 563     }
 564 }
 565 #ifdef _LIBC
 566 weak_alias (__re_set_registers, re_set_registers)
 567 #endif
 568 \f
 569 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 570    them unless specifically requested.  */
 571
 572 #if defined _REGEX_RE_COMP || defined _LIBC
 573 int
 574 # ifdef _LIBC
 575 weak_function
 576 # endif
 577 re_exec (const char *s)
 578 {
 579   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 580 }
 581 #endif /* _REGEX_RE_COMP */
 582 \f
 583 /* Internal entry point.  */
 584
 585 /* Searches for a compiled pattern PREG in the string STRING, whose
 586    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 587    mingings with regexec.  START, and RANGE have the same meanings
 588    with re_search.
 589    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 590    otherwise return the error code.
 591    Note: We assume front end functions already check ranges.
 592    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 593
 594 static reg_errcode_t
 595 internal_function
 596 re_search_internal (const regex_t *preg,
 597                     const char *string, Idx length,
 598                     Idx start, regoff_t range, Idx stop,
 599                     size_t nmatch, regmatch_t pmatch[],
 600                     int eflags)
 601 {
 602   reg_errcode_t err;
 603   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 604   Idx left_lim, right_lim;
 605   int incr;
 606   int fl_longest_match, match_kind;
 607   Idx match_first, match_last = REG_MISSING;
 608   Idx extra_nmatch;
 609   int sb, ch;
 610 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 611   re_match_context_t mctx = { .dfa = dfa };
 612 #else
 613   re_match_context_t mctx;
 614 #endif
 615   char *fastmap = (preg->re_fastmap != NULL && preg->re_fastmap_accurate
 616                    && range && !preg->re_can_be_null) ? preg->re_fastmap : NULL;
 617   unsigned REG_TRANSLATE_TYPE t =
 618     (unsigned REG_TRANSLATE_TYPE) preg->re_translate;
 619
 620 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 621   memset (&mctx, '\0', sizeof (re_match_context_t));
 622   mctx.dfa = dfa;
 623 #endif
 624
 625   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 626   nmatch -= extra_nmatch;
 627
 628   /* Check if the DFA haven't been compiled.  */
 629   if (BE (preg->re_used == 0 || dfa->init_state == NULL
 630           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 631           || dfa->init_state_begbuf == NULL, 0))
 632     return REG_NOMATCH;
 633
 634 #ifdef DEBUG
 635   /* We assume front-end functions already check them.  */
 636   assert (start + range >= 0 && start + range <= length);
 637 #endif
 638
 639   /* If initial states with non-begbuf contexts have no elements,
 640      the regex must be anchored.  If preg->re_newline_anchor is set,
 641      we'll never use init_state_nl, so do not check it.  */
 642   if (dfa->init_state->nodes.nelem == 0
 643       && dfa->init_state_word->nodes.nelem == 0
 644       && (dfa->init_state_nl->nodes.nelem == 0
 645           || !preg->re_newline_anchor))
 646     {
 647       if (start != 0 && start + range != 0)
 648         return REG_NOMATCH;
 649       start = range = 0;
 650     }
 651
 652   /* We must check the longest matching, if nmatch > 0.  */
 653   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 654
 655   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 656                             preg->re_translate,
 657                             preg->re_syntax & REG_IGNORE_CASE, dfa);
 658   if (BE (err != REG_NOERROR, 0))
 659     goto free_return;
 660   mctx.input.stop = stop;
 661   mctx.input.raw_stop = stop;
 662   mctx.input.newline_anchor = preg->re_newline_anchor;
 663
 664   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 665   if (BE (err != REG_NOERROR, 0))
 666     goto free_return;
 667
 668   /* We will log all the DFA states through which the dfa pass,
 669      if nmatch > 1, or this dfa has "multibyte node", which is a
 670      back-reference or a node which can accept multibyte character or
 671      multi character collating element.  */
 672   if (nmatch > 1 || dfa->has_mb_node)
 673     {
 674       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 675       if (BE (mctx.state_log == NULL, 0))
 676         {
 677           err = REG_ESPACE;
 678           goto free_return;
 679         }
 680     }
 681   else
 682     mctx.state_log = NULL;
 683
 684   match_first = start;
 685   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 686                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 687
 688   /* Check incrementally whether of not the input string match.  */
 689   incr = (range < 0) ? -1 : 1;
 690   left_lim = (range < 0) ? start + range : start;
 691   right_lim = (range < 0) ? start : start + range;
 692   sb = dfa->mb_cur_max == 1;
 693   match_kind =
 694     (fastmap
 695      ? ((sb || !(preg->re_syntax & REG_IGNORE_CASE || t) ? 4 : 0)
 696         | (range >= 0 ? 2 : 0)
 697         | (t != NULL ? 1 : 0))
 698      : 8);
 699
 700   for (;; match_first += incr)
 701     {
 702       err = REG_NOMATCH;
 703       if (match_first < left_lim || right_lim < match_first)
 704         goto free_return;
 705
 706       /* Advance as rapidly as possible through the string, until we
 707          find a plausible place to start matching.  This may be done
 708          with varying efficiency, so there are various possibilities:
 709          only the most common of them are specialized, in order to
 710          save on code size.  We use a switch statement for speed.  */
 711       switch (match_kind)
 712         {
 713         case 8:
 714           /* No fastmap.  */
 715           break;
 716
 717         case 7:
 718           /* Fastmap with single-byte translation, match forward.  */
 719           while (BE (match_first < right_lim, 1)
 720                  && !fastmap[t[(unsigned char) string[match_first]]])
 721             ++match_first;
 722           goto forward_match_found_start_or_reached_end;
 723
 724         case 6:
 725           /* Fastmap without translation, match forward.  */
 726           while (BE (match_first < right_lim, 1)
 727                  && !fastmap[(unsigned char) string[match_first]])
 728             ++match_first;
 729
 730         forward_match_found_start_or_reached_end:
 731           if (BE (match_first == right_lim, 0))
 732             {
 733               ch = match_first >= length
 734                        ? 0 : (unsigned char) string[match_first];
 735               if (!fastmap[t ? t[ch] : ch])
 736                 goto free_return;
 737             }
 738           break;
 739
 740         case 4:
 741         case 5:
 742           /* Fastmap without multi-byte translation, match backwards.  */
 743           while (match_first >= left_lim)
 744             {
 745               ch = match_first >= length
 746                        ? 0 : (unsigned char) string[match_first];
 747               if (fastmap[t ? t[ch] : ch])
 748                 break;
 749               --match_first;
 750             }
 751           if (match_first < left_lim)
 752             goto free_return;
 753           break;
 754
 755         default:
 756           /* In this case, we can't determine easily the current byte,
 757              since it might be a component byte of a multibyte
 758              character.  Then we use the constructed buffer instead.  */
 759           for (;;)
 760             {
 761               /* If MATCH_FIRST is out of the valid range, reconstruct the
 762                  buffers.  */
 763               __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
 764               if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
 765                 {
 766                   err = re_string_reconstruct (&mctx.input, match_first,
 767                                                eflags);
 768                   if (BE (err != REG_NOERROR, 0))
 769                     goto free_return;
 770
 771                   offset = match_first - mctx.input.raw_mbs_idx;
 772                 }
 773               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 774                  Note that MATCH_FIRST must not be smaller than 0.  */
 775               ch = (match_first >= length
 776                     ? 0 : re_string_byte_at (&mctx.input, offset));
 777               if (fastmap[ch])
 778                 break;
 779               match_first += incr;
 780               if (match_first < left_lim || match_first > right_lim)
 781                 {
 782                   err = REG_NOMATCH;
 783                   goto free_return;
 784                 }
 785             }
 786           break;
 787         }
 788
 789       /* Reconstruct the buffers so that the matcher can assume that
 790          the matching starts from the beginning of the buffer.  */
 791       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 792       if (BE (err != REG_NOERROR, 0))
 793         goto free_return;
 794
 795 #ifdef RE_ENABLE_I18N
 796      /* Don't consider this char as a possible match start if it part,
 797         yet isn't the head, of a multibyte character.  */
 798       if (!sb && !re_string_first_byte (&mctx.input, 0))
 799         continue;
 800 #endif
 801
 802       /* It seems to be appropriate one, then use the matcher.  */
 803       /* We assume that the matching starts from 0.  */
 804       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 805       match_last = check_matching (&mctx, fl_longest_match,
 806                                    range >= 0 ? &match_first : NULL);
 807       if (match_last != REG_MISSING)
 808         {
 809           if (BE (match_last == REG_ERROR, 0))
 810             {
 811               err = REG_ESPACE;
 812               goto free_return;
 813             }
 814           else
 815             {
 816               mctx.match_last = match_last;
 817               if ((!preg->re_no_sub && nmatch > 1) || dfa->nbackref)
 818                 {
 819                   re_dfastate_t *pstate = mctx.state_log[match_last];
 820                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 821                                                              match_last);
 822                 }
 823               if ((!preg->re_no_sub && nmatch > 1 && dfa->has_plural_match)
 824                   || dfa->nbackref)
 825                 {
 826                   err = prune_impossible_nodes (&mctx);
 827                   if (err == REG_NOERROR)
 828                     break;
 829                   if (BE (err != REG_NOMATCH, 0))
 830                     goto free_return;
 831                   match_last = REG_MISSING;
 832                 }
 833               else
 834                 break; /* We found a match.  */
 835             }
 836         }
 837
 838       match_ctx_clean (&mctx);
 839     }
 840
 841 #ifdef DEBUG
 842   assert (match_last != REG_MISSING);
 843   assert (err == REG_NOERROR);
 844 #endif
 845
 846   /* Set pmatch[] if we need.  */
 847   if (nmatch > 0)
 848     {
 849       Idx reg_idx;
 850
 851       /* Initialize registers.  */
 852       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 853         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 854
 855       /* Set the points where matching start/end.  */
 856       pmatch[0].rm_so = 0;
 857       pmatch[0].rm_eo = mctx.match_last;
 858       /* FIXME: This function should fail if mctx.match_last exceeds
 859          the maximum possible regoff_t value.  We need a new error
 860          code REG_OVERFLOW.  */
 861
 862       if (!preg->re_no_sub && nmatch > 1)
 863         {
 864           err = set_regs (preg, &mctx, nmatch, pmatch,
 865                           dfa->has_plural_match && dfa->nbackref > 0);
 866           if (BE (err != REG_NOERROR, 0))
 867             goto free_return;
 868         }
 869
 870       /* At last, add the offset to the each registers, since we slided
 871          the buffers so that we could assume that the matching starts
 872          from 0.  */
 873       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 874         if (pmatch[reg_idx].rm_so != -1)
 875           {
 876 #ifdef RE_ENABLE_I18N
 877             if (BE (mctx.input.offsets_needed != 0, 0))
 878               {
 879                 pmatch[reg_idx].rm_so =
 880                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 881                    ? mctx.input.valid_raw_len
 882                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 883                 pmatch[reg_idx].rm_eo =
 884                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 885                    ? mctx.input.valid_raw_len
 886                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 887               }
 888 #else
 889             assert (mctx.input.offsets_needed == 0);
 890 #endif
 891             pmatch[reg_idx].rm_so += match_first;
 892             pmatch[reg_idx].rm_eo += match_first;
 893           }
 894       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 895         {
 896           pmatch[nmatch + reg_idx].rm_so = -1;
 897           pmatch[nmatch + reg_idx].rm_eo = -1;
 898         }
 899
 900       if (dfa->subexp_map)
 901         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 902           if (dfa->subexp_map[reg_idx] != reg_idx)
 903             {
 904               pmatch[reg_idx + 1].rm_so
 905                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 906               pmatch[reg_idx + 1].rm_eo
 907                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 908             }
 909     }
 910
 911  free_return:
 912   re_free (mctx.state_log);
 913   if (dfa->nbackref)
 914     match_ctx_free (&mctx);
 915   re_string_destruct (&mctx.input);
 916   return err;
 917 }
 918
 919 static reg_errcode_t
 920 internal_function
 921 prune_impossible_nodes (re_match_context_t *mctx)
 922 {
 923   re_dfa_t *const dfa = mctx->dfa;
 924   Idx halt_node, match_last;
 925   reg_errcode_t ret;
 926   re_dfastate_t **sifted_states;
 927   re_dfastate_t **lim_states = NULL;
 928   re_sift_context_t sctx;
 929 #ifdef DEBUG
 930   assert (mctx->state_log != NULL);
 931 #endif
 932   match_last = mctx->match_last;
 933   halt_node = mctx->last_node;
 934   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 935   if (BE (sifted_states == NULL, 0))
 936     {
 937       ret = REG_ESPACE;
 938       goto free_return;
 939     }
 940   if (dfa->nbackref)
 941     {
 942       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 943       if (BE (lim_states == NULL, 0))
 944         {
 945           ret = REG_ESPACE;
 946           goto free_return;
 947         }
 948       while (1)
 949         {
 950           memset (lim_states, '\0',
 951                   sizeof (re_dfastate_t *) * (match_last + 1));
 952           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 953                          match_last);
 954           ret = sift_states_backward (mctx, &sctx);
 955           re_node_set_free (&sctx.limits);
 956           if (BE (ret != REG_NOERROR, 0))
 957               goto free_return;
 958           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 959             break;
 960           do
 961             {
 962               --match_last;
 963               if (! REG_VALID_INDEX (match_last))
 964                 {
 965                   ret = REG_NOMATCH;
 966                   goto free_return;
 967                 }
 968             } while (mctx->state_log[match_last] == NULL
 969                      || !mctx->state_log[match_last]->halt);
 970           halt_node = check_halt_state_context (mctx,
 971                                                 mctx->state_log[match_last],
 972                                                 match_last);
 973         }
 974       ret = merge_state_array (dfa, sifted_states, lim_states,
 975                                match_last + 1);
 976       re_free (lim_states);
 977       lim_states = NULL;
 978       if (BE (ret != REG_NOERROR, 0))
 979         goto free_return;
 980     }
 981   else
 982     {
 983       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 984       ret = sift_states_backward (mctx, &sctx);
 985       re_node_set_free (&sctx.limits);
 986       if (BE (ret != REG_NOERROR, 0))
 987         goto free_return;
 988     }
 989   re_free (mctx->state_log);
 990   mctx->state_log = sifted_states;
 991   sifted_states = NULL;
 992   mctx->last_node = halt_node;
 993   mctx->match_last = match_last;
 994   ret = REG_NOERROR;
 995  free_return:
 996   re_free (sifted_states);
 997   re_free (lim_states);
 998   return ret;
 999 }
1000
1001 /* Acquire an initial state and return it.
1002    We must select appropriate initial state depending on the context,
1003    since initial states may have constraints like "\<", "^", etc..  */
1004
1005 static inline re_dfastate_t *
1006 __attribute ((always_inline)) internal_function
1007 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1008                             Idx idx)
1009 {
1010   re_dfa_t *const dfa = mctx->dfa;
1011   if (dfa->init_state->has_constraint)
1012     {
1013       unsigned int context;
1014       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1015       if (IS_WORD_CONTEXT (context))
1016         return dfa->init_state_word;
1017       else if (IS_ORDINARY_CONTEXT (context))
1018         return dfa->init_state;
1019       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1020         return dfa->init_state_begbuf;
1021       else if (IS_NEWLINE_CONTEXT (context))
1022         return dfa->init_state_nl;
1023       else if (IS_BEGBUF_CONTEXT (context))
1024         {
1025           /* It is relatively rare case, then calculate on demand.  */
1026           return re_acquire_state_context (err, dfa,
1027                                            dfa->init_state->entrance_nodes,
1028                                            context);
1029         }
1030       else
1031         /* Must not happen?  */
1032         return dfa->init_state;
1033     }
1034   else
1035     return dfa->init_state;
1036 }
1037
1038 /* Check whether the regular expression match input string INPUT or not,
1039    and return the index where the matching end.  Return REG_MISSING if
1040    there is no match, and return REG_ERROR in case of an error.
1041    FL_LONGEST_MATCH means we want the POSIX longest matching.
1042    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1043    next place where we may want to try matching.
1044    Note that the matcher assume that the maching starts from the current
1045    index of the buffer.  */
1046
1047 static Idx
1048 internal_function
1049 check_matching (re_match_context_t *mctx, int fl_longest_match,
1050                 Idx *p_match_first)
1051 {
1052   re_dfa_t *const dfa = mctx->dfa;
1053   reg_errcode_t err;
1054   Idx match = 0;
1055   Idx match_last = REG_MISSING;
1056   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1057   re_dfastate_t *cur_state;
1058   int at_init_state = p_match_first != NULL;
1059   Idx next_start_idx = cur_str_idx;
1060
1061   err = REG_NOERROR;
1062   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1063   /* An initial state must not be NULL (invalid).  */
1064   if (BE (cur_state == NULL, 0))
1065     {
1066       assert (err == REG_ESPACE);
1067       return REG_ERROR;
1068     }
1069
1070   if (mctx->state_log != NULL)
1071     {
1072       mctx->state_log[cur_str_idx] = cur_state;
1073
1074       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1075          later.  E.g. Processing back references.  */
1076       if (BE (dfa->nbackref, 0))
1077         {
1078           at_init_state = 0;
1079           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1080           if (BE (err != REG_NOERROR, 0))
1081             return err;
1082
1083           if (cur_state->has_backref)
1084             {
1085               err = transit_state_bkref (mctx, &cur_state->nodes);
1086               if (BE (err != REG_NOERROR, 0))
1087                 return err;
1088             }
1089         }
1090     }
1091
1092   /* If the RE accepts NULL string.  */
1093   if (BE (cur_state->halt, 0))
1094     {
1095       if (!cur_state->has_constraint
1096           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1097         {
1098           if (!fl_longest_match)
1099             return cur_str_idx;
1100           else
1101             {
1102               match_last = cur_str_idx;
1103               match = 1;
1104             }
1105         }
1106     }
1107
1108   while (!re_string_eoi (&mctx->input))
1109     {
1110       re_dfastate_t *old_state = cur_state;
1111       Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1112
1113       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1114           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1115               && mctx->input.valid_len < mctx->input.len))
1116         {
1117           err = extend_buffers (mctx);
1118           if (BE (err != REG_NOERROR, 0))
1119             {
1120               assert (err == REG_ESPACE);
1121               return REG_ERROR;
1122             }
1123         }
1124
1125       cur_state = transit_state (&err, mctx, cur_state);
1126       if (mctx->state_log != NULL)
1127         cur_state = merge_state_with_log (&err, mctx, cur_state);
1128
1129       if (cur_state == NULL)
1130         {
1131           /* Reached the invalid state or an error.  Try to recover a valid
1132              state using the state log, if available and if we have not
1133              already found a valid (even if not the longest) match.  */
1134           if (BE (err != REG_NOERROR, 0))
1135             return REG_ERROR;
1136
1137           if (mctx->state_log == NULL
1138               || (match && !fl_longest_match)
1139               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1140             break;
1141         }
1142
1143       if (BE (at_init_state, 0))
1144         {
1145           if (old_state == cur_state)
1146             next_start_idx = next_char_idx;
1147           else
1148             at_init_state = 0;
1149         }
1150
1151       if (cur_state->halt)
1152         {
1153           /* Reached a halt state.
1154              Check the halt state can satisfy the current context.  */
1155           if (!cur_state->has_constraint
1156               || check_halt_state_context (mctx, cur_state,
1157                                            re_string_cur_idx (&mctx->input)))
1158             {
1159               /* We found an appropriate halt state.  */
1160               match_last = re_string_cur_idx (&mctx->input);
1161               match = 1;
1162
1163               /* We found a match, do not modify match_first below.  */
1164               p_match_first = NULL;
1165               if (!fl_longest_match)
1166                 break;
1167             }
1168         }
1169     }
1170
1171   if (p_match_first)
1172     *p_match_first += next_start_idx;
1173
1174   return match_last;
1175 }
1176
1177 /* Check NODE match the current context.  */
1178
1179 static int
1180 internal_function
1181 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1182 {
1183   re_token_type_t type = dfa->nodes[node].type;
1184   unsigned int constraint = dfa->nodes[node].constraint;
1185   if (type != END_OF_RE)
1186     return 0;
1187   if (!constraint)
1188     return 1;
1189   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1190     return 0;
1191   return 1;
1192 }
1193
1194 /* Check the halt state STATE match the current context.
1195    Return 0 if not match, if the node, STATE has, is a halt node and
1196    match the context, return the node.  */
1197
1198 static Idx
1199 internal_function
1200 check_halt_state_context (const re_match_context_t *mctx,
1201                           const re_dfastate_t *state, Idx idx)
1202 {
1203   Idx i;
1204   unsigned int context;
1205 #ifdef DEBUG
1206   assert (state->halt);
1207 #endif
1208   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1209   for (i = 0; i < state->nodes.nelem; ++i)
1210     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1211       return state->nodes.elems[i];
1212   return 0;
1213 }
1214
1215 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1216    corresponding to the DFA).
1217    Return the destination node, and update EPS_VIA_NODES;
1218    return REG_MISSING in case of errors.  */
1219
1220 static Idx
1221 internal_function
1222 proceed_next_node (const re_match_context_t *mctx,
1223                    Idx nregs, regmatch_t *regs, Idx *pidx, Idx node,
1224                    re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1225 {
1226   re_dfa_t *const dfa = mctx->dfa;
1227   Idx i;
1228   int err;
1229   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1230     {
1231       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1232       re_node_set *edests = &dfa->edests[node];
1233       Idx dest_node;
1234       err = re_node_set_insert (eps_via_nodes, node);
1235       if (BE (err < 0, 0))
1236         return REG_ERROR;
1237       /* Pick up a valid destination, or return REG_MISSING if none
1238          is found.  */
1239       for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1240         {
1241           Idx candidate = edests->elems[i];
1242           if (!re_node_set_contains (cur_nodes, candidate))
1243             continue;
1244           if (dest_node == REG_MISSING)
1245             dest_node = candidate;
1246
1247           else
1248             {
1249               /* In order to avoid infinite loop like "(a*)*", return the second
1250                  epsilon-transition if the first was already considered.  */
1251               if (re_node_set_contains (eps_via_nodes, dest_node))
1252                 return candidate;
1253
1254               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1255               else if (fs != NULL
1256                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1257                                            eps_via_nodes))
1258                 return REG_ERROR;
1259
1260               /* We know we are going to exit.  */
1261               break;
1262             }
1263         }
1264       return dest_node;
1265     }
1266   else
1267     {
1268       Idx naccepted = 0;
1269       re_token_type_t type = dfa->nodes[node].type;
1270
1271 #ifdef RE_ENABLE_I18N
1272       if (dfa->nodes[node].accept_mb)
1273         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1274       else
1275 #endif /* RE_ENABLE_I18N */
1276       if (type == OP_BACK_REF)
1277         {
1278           Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1279           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1280           if (fs != NULL)
1281             {
1282               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1283                 return REG_MISSING;
1284               else if (naccepted)
1285                 {
1286                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1287                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1288                               naccepted) != 0)
1289                     return REG_MISSING;
1290                 }
1291             }
1292
1293           if (naccepted == 0)
1294             {
1295               Idx dest_node;
1296               err = re_node_set_insert (eps_via_nodes, node);
1297               if (BE (err < 0, 0))
1298                 return REG_ERROR;
1299               dest_node = dfa->edests[node].elems[0];
1300               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1301                                         dest_node))
1302                 return dest_node;
1303             }
1304         }
1305
1306       if (naccepted != 0
1307           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1308         {
1309           Idx dest_node = dfa->nexts[node];
1310           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1311           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1312                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1313                                                dest_node)))
1314             return REG_MISSING;
1315           re_node_set_empty (eps_via_nodes);
1316           return dest_node;
1317         }
1318     }
1319   return REG_MISSING;
1320 }
1321
1322 static reg_errcode_t
1323 internal_function
1324 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1325                  Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1326 {
1327   reg_errcode_t err;
1328   Idx num = fs->num++;
1329   if (fs->num == fs->alloc)
1330     {
1331       struct re_fail_stack_ent_t *new_array =
1332         re_realloc (fs->stack, struct re_fail_stack_ent_t, fs->alloc * 2);
1333       if (new_array == NULL)
1334         return REG_ESPACE;
1335       fs->alloc *= 2;
1336       fs->stack = new_array;
1337     }
1338   fs->stack[num].idx = str_idx;
1339   fs->stack[num].node = dest_node;
1340   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1341   if (fs->stack[num].regs == NULL)
1342     return REG_ESPACE;
1343   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1344   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1345   return err;
1346 }
1347
1348 static Idx
1349 internal_function
1350 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx,
1351                 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1352 {
1353   Idx num = --fs->num;
1354   assert (REG_VALID_INDEX (num));
1355   *pidx = fs->stack[num].idx;
1356   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1357   re_node_set_free (eps_via_nodes);
1358   re_free (fs->stack[num].regs);
1359   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1360   return fs->stack[num].node;
1361 }
1362
1363 /* Set the positions where the subexpressions are starts/ends to registers
1364    PMATCH.
1365    Note: We assume that pmatch[0] is already set, and
1366    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1367
1368 static reg_errcode_t
1369 internal_function
1370 set_regs (const regex_t *preg, const re_match_context_t *mctx,
1371           size_t nmatch, regmatch_t *pmatch, int fl_backtrack)
1372 {
1373   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1374   Idx idx, cur_node;
1375   re_node_set eps_via_nodes;
1376   struct re_fail_stack_t *fs;
1377   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1378   regmatch_t *prev_idx_match;
1379   int prev_idx_match_malloced = 0;
1380
1381 #ifdef DEBUG
1382   assert (nmatch > 1);
1383   assert (mctx->state_log != NULL);
1384 #endif
1385   if (fl_backtrack)
1386     {
1387       fs = &fs_body;
1388       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1389       if (fs->stack == NULL)
1390         return REG_ESPACE;
1391     }
1392   else
1393     fs = NULL;
1394
1395   cur_node = dfa->init_node;
1396   re_node_set_init_empty (&eps_via_nodes);
1397
1398   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1399     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1400   else
1401     {
1402       prev_idx_match = re_malloc (regmatch_t, nmatch);
1403       if (prev_idx_match == NULL)
1404         {
1405           free_fail_stack_return (fs);
1406           return REG_ESPACE;
1407         }
1408       prev_idx_match_malloced = 1;
1409     }
1410   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1411
1412   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1413     {
1414       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1415
1416       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1417         {
1418           Idx reg_idx;
1419           if (fs)
1420             {
1421               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1422                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1423                   break;
1424               if (reg_idx == nmatch)
1425                 {
1426                   re_node_set_free (&eps_via_nodes);
1427                   if (prev_idx_match_malloced)
1428                     re_free (prev_idx_match);
1429                   return free_fail_stack_return (fs);
1430                 }
1431               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1432                                          &eps_via_nodes);
1433             }
1434           else
1435             {
1436               re_node_set_free (&eps_via_nodes);
1437               if (prev_idx_match_malloced)
1438                 re_free (prev_idx_match);
1439               return REG_NOERROR;
1440             }
1441         }
1442
1443       /* Proceed to next node.  */
1444       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1445                                     &eps_via_nodes, fs);
1446
1447       if (BE (! REG_VALID_INDEX (cur_node), 0))
1448         {
1449           if (BE (cur_node == REG_ERROR, 0))
1450             {
1451               re_node_set_free (&eps_via_nodes);
1452               if (prev_idx_match_malloced)
1453                 re_free (prev_idx_match);
1454               free_fail_stack_return (fs);
1455               return REG_ESPACE;
1456             }
1457           if (fs)
1458             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1459                                        &eps_via_nodes);
1460           else
1461             {
1462               re_node_set_free (&eps_via_nodes);
1463               if (prev_idx_match_malloced)
1464                 re_free (prev_idx_match);
1465               return REG_NOMATCH;
1466             }
1467         }
1468     }
1469   re_node_set_free (&eps_via_nodes);
1470   if (prev_idx_match_malloced)
1471     re_free (prev_idx_match);
1472   return free_fail_stack_return (fs);
1473 }
1474
1475 static reg_errcode_t
1476 internal_function
1477 free_fail_stack_return (struct re_fail_stack_t *fs)
1478 {
1479   if (fs)
1480     {
1481       Idx fs_idx;
1482       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1483         {
1484           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1485           re_free (fs->stack[fs_idx].regs);
1486         }
1487       re_free (fs->stack);
1488     }
1489   return REG_NOERROR;
1490 }
1491
1492 static void
1493 internal_function
1494 update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1495              Idx cur_node, Idx cur_idx, Idx nmatch)
1496 {
1497   int type = dfa->nodes[cur_node].type;
1498   if (type == OP_OPEN_SUBEXP)
1499     {
1500       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1501
1502       /* We are at the first node of this sub expression.  */
1503       if (reg_num < nmatch)
1504         {
1505           pmatch[reg_num].rm_so = cur_idx;
1506           pmatch[reg_num].rm_eo = -1;
1507         }
1508     }
1509   else if (type == OP_CLOSE_SUBEXP)
1510     {
1511       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1512       if (reg_num < nmatch)
1513         {
1514           /* We are at the last node of this sub expression.  */
1515           if (pmatch[reg_num].rm_so < cur_idx)
1516             {
1517               pmatch[reg_num].rm_eo = cur_idx;
1518               /* This is a non-empty match or we are not inside an optional
1519                  subexpression.  Accept this right away.  */
1520               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1521             }
1522           else
1523             {
1524               if (dfa->nodes[cur_node].opt_subexp
1525                   && prev_idx_match[reg_num].rm_so != -1)
1526                 /* We transited through an empty match for an optional
1527                    subexpression, like (a?)*, and this is not the subexp's
1528                    first match.  Copy back the old content of the registers
1529                    so that matches of an inner subexpression are undone as
1530                    well, like in ((a?))*.  */
1531                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1532               else
1533                 /* We completed a subexpression, but it may be part of
1534                    an optional one, so do not update PREV_IDX_MATCH.  */
1535                 pmatch[reg_num].rm_eo = cur_idx;
1536             }
1537         }
1538     }
1539 }
1540
1541 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1542    and sift the nodes in each states according to the following rules.
1543    Updated state_log will be wrote to STATE_LOG.
1544
1545    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1546      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1547         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1548         the LAST_NODE, we throw away the node `a'.
1549      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1550         string `s' and transit to `b':
1551         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1552            away the node `a'.
1553         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1554             thrown away, we throw away the node `a'.
1555      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1556         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1557            node `a'.
1558         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1559             we throw away the node `a'.  */
1560
1561 #define STATE_NODE_CONTAINS(state,node) \
1562   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1563
1564 static reg_errcode_t
1565 internal_function
1566 sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1567 {
1568   reg_errcode_t err;
1569   int null_cnt = 0;
1570   Idx str_idx = sctx->last_str_idx;
1571   re_node_set cur_dest;
1572
1573 #ifdef DEBUG
1574   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1575 #endif
1576
1577   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1578      transit to the last_node and the last_node itself.  */
1579   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1580   if (BE (err != REG_NOERROR, 0))
1581     return err;
1582   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1583   if (BE (err != REG_NOERROR, 0))
1584     goto free_return;
1585
1586   /* Then check each states in the state_log.  */
1587   while (str_idx > 0)
1588     {
1589       /* Update counters.  */
1590       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1591       if (null_cnt > mctx->max_mb_elem_len)
1592         {
1593           memset (sctx->sifted_states, '\0',
1594                   sizeof (re_dfastate_t *) * str_idx);
1595           re_node_set_free (&cur_dest);
1596           return REG_NOERROR;
1597         }
1598       re_node_set_empty (&cur_dest);
1599       --str_idx;
1600
1601       if (mctx->state_log[str_idx])
1602         {
1603           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1604           if (BE (err != REG_NOERROR, 0))
1605             goto free_return;
1606         }
1607
1608       /* Add all the nodes which satisfy the following conditions:
1609          - It can epsilon transit to a node in CUR_DEST.
1610          - It is in CUR_SRC.
1611          And update state_log.  */
1612       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1613       if (BE (err != REG_NOERROR, 0))
1614         goto free_return;
1615     }
1616   err = REG_NOERROR;
1617  free_return:
1618   re_node_set_free (&cur_dest);
1619   return err;
1620 }
1621
1622 static reg_errcode_t
1623 internal_function
1624 build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1625                      Idx str_idx, re_node_set *cur_dest)
1626 {
1627   re_dfa_t *const dfa = mctx->dfa;
1628   re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1629   Idx i;
1630
1631   /* Then build the next sifted state.
1632      We build the next sifted state on `cur_dest', and update
1633      `sifted_states[str_idx]' with `cur_dest'.
1634      Note:
1635      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1636      `cur_src' points the node_set of the old `state_log[str_idx]'
1637      (with the epsilon nodes pre-filtered out).  */
1638   for (i = 0; i < cur_src->nelem; i++)
1639     {
1640       Idx prev_node = cur_src->elems[i];
1641       int naccepted = 0;
1642       int ret;
1643
1644 #ifdef DEBUG
1645       re_token_type_t type = dfa->nodes[prev_node].type;
1646       assert (!IS_EPSILON_NODE (type));
1647 #endif
1648 #ifdef RE_ENABLE_I18N
1649       /* If the node may accept `multi byte'.  */
1650       if (dfa->nodes[prev_node].accept_mb)
1651         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1652                                          str_idx, sctx->last_str_idx);
1653 #endif /* RE_ENABLE_I18N */
1654
1655       /* We don't check backreferences here.
1656          See update_cur_sifted_state().  */
1657       if (!naccepted
1658           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1659           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1660                                   dfa->nexts[prev_node]))
1661         naccepted = 1;
1662
1663       if (naccepted == 0)
1664         continue;
1665
1666       if (sctx->limits.nelem)
1667         {
1668           Idx to_idx = str_idx + naccepted;
1669           if (check_dst_limits (mctx, &sctx->limits,
1670                                 dfa->nexts[prev_node], to_idx,
1671                                 prev_node, str_idx))
1672             continue;
1673         }
1674       ret = re_node_set_insert (cur_dest, prev_node);
1675       if (BE (ret == -1, 0))
1676         return REG_ESPACE;
1677     }
1678
1679   return REG_NOERROR;
1680 }
1681
1682 /* Helper functions.  */
1683
1684 static reg_errcode_t
1685 internal_function
1686 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1687 {
1688   Idx top = mctx->state_log_top;
1689
1690   if (next_state_log_idx >= mctx->input.bufs_len
1691       || (next_state_log_idx >= mctx->input.valid_len
1692           && mctx->input.valid_len < mctx->input.len))
1693     {
1694       reg_errcode_t err;
1695       err = extend_buffers (mctx);
1696       if (BE (err != REG_NOERROR, 0))
1697         return err;
1698     }
1699
1700   if (top < next_state_log_idx)
1701     {
1702       memset (mctx->state_log + top + 1, '\0',
1703               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1704       mctx->state_log_top = next_state_log_idx;
1705     }
1706   return REG_NOERROR;
1707 }
1708
1709 static reg_errcode_t
1710 internal_function
1711 merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1712                    Idx num)
1713 {
1714   Idx st_idx;
1715   reg_errcode_t err;
1716   for (st_idx = 0; st_idx < num; ++st_idx)
1717     {
1718       if (dst[st_idx] == NULL)
1719         dst[st_idx] = src[st_idx];
1720       else if (src[st_idx] != NULL)
1721         {
1722           re_node_set merged_set;
1723           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1724                                         &src[st_idx]->nodes);
1725           if (BE (err != REG_NOERROR, 0))
1726             return err;
1727           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1728           re_node_set_free (&merged_set);
1729           if (BE (err != REG_NOERROR, 0))
1730             return err;
1731         }
1732     }
1733   return REG_NOERROR;
1734 }
1735
1736 static reg_errcode_t
1737 internal_function
1738 update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1739                          Idx str_idx, re_node_set *dest_nodes)
1740 {
1741   re_dfa_t *const dfa = mctx->dfa;
1742   reg_errcode_t err;
1743   const re_node_set *candidates;
1744   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1745                 : &mctx->state_log[str_idx]->nodes);
1746
1747   if (dest_nodes->nelem == 0)
1748     sctx->sifted_states[str_idx] = NULL;
1749   else
1750     {
1751       if (candidates)
1752         {
1753           /* At first, add the nodes which can epsilon transit to a node in
1754              DEST_NODE.  */
1755           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1756           if (BE (err != REG_NOERROR, 0))
1757             return err;
1758
1759           /* Then, check the limitations in the current sift_context.  */
1760           if (sctx->limits.nelem)
1761             {
1762               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1763                                          mctx->bkref_ents, str_idx);
1764               if (BE (err != REG_NOERROR, 0))
1765                 return err;
1766             }
1767         }
1768
1769       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1770       if (BE (err != REG_NOERROR, 0))
1771         return err;
1772     }
1773
1774   if (candidates && mctx->state_log[str_idx]->has_backref)
1775     {
1776       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1777       if (BE (err != REG_NOERROR, 0))
1778         return err;
1779     }
1780   return REG_NOERROR;
1781 }
1782
1783 static reg_errcode_t
1784 internal_function
1785 add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1786                        const re_node_set *candidates)
1787 {
1788   reg_errcode_t err = REG_NOERROR;
1789   Idx i;
1790
1791   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1792   if (BE (err != REG_NOERROR, 0))
1793     return err;
1794
1795   if (!state->inveclosure.alloc)
1796     {
1797       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1798       if (BE (err != REG_NOERROR, 0))
1799         return REG_ESPACE;
1800       for (i = 0; i < dest_nodes->nelem; i++)
1801         re_node_set_merge (&state->inveclosure,
1802                            dfa->inveclosures + dest_nodes->elems[i]);
1803     }
1804   return re_node_set_add_intersect (dest_nodes, candidates,
1805                                     &state->inveclosure);
1806 }
1807
1808 static reg_errcode_t
1809 internal_function
1810 sub_epsilon_src_nodes (re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1811                        const re_node_set *candidates)
1812 {
1813     Idx ecl_idx;
1814     reg_errcode_t err;
1815     re_node_set *inv_eclosure = dfa->inveclosures + node;
1816     re_node_set except_nodes;
1817     re_node_set_init_empty (&except_nodes);
1818     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1819       {
1820         Idx cur_node = inv_eclosure->elems[ecl_idx];
1821         if (cur_node == node)
1822           continue;
1823         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1824           {
1825             Idx edst1 = dfa->edests[cur_node].elems[0];
1826             Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1827                          ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1828             if ((!re_node_set_contains (inv_eclosure, edst1)
1829                  && re_node_set_contains (dest_nodes, edst1))
1830                 || (REG_VALID_NONZERO_INDEX (edst2)
1831                     && !re_node_set_contains (inv_eclosure, edst2)
1832                     && re_node_set_contains (dest_nodes, edst2)))
1833               {
1834                 err = re_node_set_add_intersect (&except_nodes, candidates,
1835                                                  dfa->inveclosures + cur_node);
1836                 if (BE (err != REG_NOERROR, 0))
1837                   {
1838                     re_node_set_free (&except_nodes);
1839                     return err;
1840                   }
1841               }
1842           }
1843       }
1844     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1845       {
1846         Idx cur_node = inv_eclosure->elems[ecl_idx];
1847         if (!re_node_set_contains (&except_nodes, cur_node))
1848           {
1849             Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1850             re_node_set_remove_at (dest_nodes, idx);
1851           }
1852       }
1853     re_node_set_free (&except_nodes);
1854     return REG_NOERROR;
1855 }
1856
1857 static int
1858 internal_function
1859 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1860                   Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1861 {
1862   re_dfa_t *const dfa = mctx->dfa;
1863   Idx lim_idx, src_pos, dst_pos;
1864
1865   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1866   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1867   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1868     {
1869       Idx subexp_idx;
1870       struct re_backref_cache_entry *ent;
1871       ent = mctx->bkref_ents + limits->elems[lim_idx];
1872       subexp_idx = dfa->nodes[ent->node].opr.idx;
1873
1874       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1875                                            subexp_idx, dst_node, dst_idx,
1876                                            dst_bkref_idx);
1877       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1878                                            subexp_idx, src_node, src_idx,
1879                                            src_bkref_idx);
1880
1881       /* In case of:
1882          <src> <dst> ( <subexp> )
1883          ( <subexp> ) <src> <dst>
1884          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1885       if (src_pos == dst_pos)
1886         continue; /* This is unrelated limitation.  */
1887       else
1888         return 1;
1889     }
1890   return 0;
1891 }
1892
1893 static int
1894 internal_function
1895 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1896                              Idx subexp_idx, Idx from_node, Idx bkref_idx)
1897 {
1898   re_dfa_t *const dfa = mctx->dfa;
1899   re_node_set *eclosures = dfa->eclosures + from_node;
1900   Idx node_idx;
1901
1902   /* Else, we are on the boundary: examine the nodes on the epsilon
1903      closure.  */
1904   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1905     {
1906       Idx node = eclosures->elems[node_idx];
1907       switch (dfa->nodes[node].type)
1908         {
1909         case OP_BACK_REF:
1910           if (bkref_idx != REG_MISSING)
1911             {
1912               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1913               do
1914                 {
1915                   Idx dst;
1916                   int cpos;
1917
1918                   if (ent->node != node)
1919                     continue;
1920
1921                   if (subexp_idx
1922                       < CHAR_BIT * sizeof ent->eps_reachable_subexps_map
1923                       && !(ent->eps_reachable_subexps_map & (1u << subexp_idx)))
1924                     continue;
1925
1926                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1927                      OP_CLOSE_SUBEXP cases below.  But, if the
1928                      destination node is the same node as the source
1929                      node, don't recurse because it would cause an
1930                      infinite loop: a regex that exhibits this behavior
1931                      is ()\1*\1*  */
1932                   dst = dfa->edests[node].elems[0];
1933                   if (dst == from_node)
1934                     {
1935                       if (boundaries & 1)
1936                         return -1;
1937                       else /* if (boundaries & 2) */
1938                         return 0;
1939                     }
1940
1941                   cpos =
1942                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1943                                                  dst, bkref_idx);
1944                   if (cpos == -1 /* && (boundaries & 1) */)
1945                     return -1;
1946                   if (cpos == 0 && (boundaries & 2))
1947                     return 0;
1948
1949                   if (subexp_idx
1950                       < CHAR_BIT * sizeof ent->eps_reachable_subexps_map)
1951                     ent->eps_reachable_subexps_map &= ~(1u << subexp_idx);
1952                 }
1953               while (ent++->more);
1954             }
1955           break;
1956
1957         case OP_OPEN_SUBEXP:
1958           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1959             return -1;
1960           break;
1961
1962         case OP_CLOSE_SUBEXP:
1963           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1964             return 0;
1965           break;
1966
1967         default:
1968             break;
1969         }
1970     }
1971
1972   return (boundaries & 2) ? 1 : 0;
1973 }
1974
1975 static int
1976 internal_function
1977 check_dst_limits_calc_pos (const re_match_context_t *mctx,
1978                            Idx limit, Idx subexp_idx,
1979                            Idx from_node, Idx str_idx, Idx bkref_idx)
1980 {
1981   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1982   int boundaries;
1983
1984   /* If we are outside the range of the subexpression, return -1 or 1.  */
1985   if (str_idx < lim->subexp_from)
1986     return -1;
1987
1988   if (lim->subexp_to < str_idx)
1989     return 1;
1990
1991   /* If we are within the subexpression, return 0.  */
1992   boundaries = (str_idx == lim->subexp_from);
1993   boundaries |= (str_idx == lim->subexp_to) << 1;
1994   if (boundaries == 0)
1995     return 0;
1996
1997   /* Else, examine epsilon closure.  */
1998   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1999                                       from_node, bkref_idx);
2000 }
2001
2002 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2003    which are against limitations from DEST_NODES. */
2004
2005 static reg_errcode_t
2006 internal_function
2007 check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
2008                      const re_node_set *candidates, re_node_set *limits,
2009                      struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2010 {
2011   reg_errcode_t err;
2012   Idx node_idx, lim_idx;
2013
2014   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2015     {
2016       Idx subexp_idx;
2017       struct re_backref_cache_entry *ent;
2018       ent = bkref_ents + limits->elems[lim_idx];
2019
2020       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2021         continue; /* This is unrelated limitation.  */
2022
2023       subexp_idx = dfa->nodes[ent->node].opr.idx;
2024       if (ent->subexp_to == str_idx)
2025         {
2026           Idx ops_node = REG_MISSING;
2027           Idx cls_node = REG_MISSING;
2028           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2029             {
2030               Idx node = dest_nodes->elems[node_idx];
2031               re_token_type_t type = dfa->nodes[node].type;
2032               if (type == OP_OPEN_SUBEXP
2033                   && subexp_idx == dfa->nodes[node].opr.idx)
2034                 ops_node = node;
2035               else if (type == OP_CLOSE_SUBEXP
2036                        && subexp_idx == dfa->nodes[node].opr.idx)
2037                 cls_node = node;
2038             }
2039
2040           /* Check the limitation of the open subexpression.  */
2041           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2042           if (REG_VALID_INDEX (ops_node))
2043             {
2044               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2045                                            candidates);
2046               if (BE (err != REG_NOERROR, 0))
2047                 return err;
2048             }
2049
2050           /* Check the limitation of the close subexpression.  */
2051           if (REG_VALID_INDEX (cls_node))
2052             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2053               {
2054                 Idx node = dest_nodes->elems[node_idx];
2055                 if (!re_node_set_contains (dfa->inveclosures + node,
2056                                            cls_node)
2057                     && !re_node_set_contains (dfa->eclosures + node,
2058                                               cls_node))
2059                   {
2060                     /* It is against this limitation.
2061                        Remove it form the current sifted state.  */
2062                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2063                                                  candidates);
2064                     if (BE (err != REG_NOERROR, 0))
2065                       return err;
2066                     --node_idx;
2067                   }
2068               }
2069         }
2070       else /* (ent->subexp_to != str_idx)  */
2071         {
2072           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2073             {
2074               Idx node = dest_nodes->elems[node_idx];
2075               re_token_type_t type = dfa->nodes[node].type;
2076               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2077                 {
2078                   if (subexp_idx != dfa->nodes[node].opr.idx)
2079                     continue;
2080                   /* It is against this limitation.
2081                      Remove it form the current sifted state.  */
2082                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2083                                                candidates);
2084                   if (BE (err != REG_NOERROR, 0))
2085                     return err;
2086                 }
2087             }
2088         }
2089     }
2090   return REG_NOERROR;
2091 }
2092
2093 static reg_errcode_t
2094 internal_function
2095 sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2096                    Idx str_idx, const re_node_set *candidates)
2097 {
2098   re_dfa_t *const dfa = mctx->dfa;
2099   reg_errcode_t err;
2100   Idx node_idx, node;
2101   re_sift_context_t local_sctx;
2102   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2103
2104   if (first_idx == REG_MISSING)
2105     return REG_NOERROR;
2106
2107   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2108
2109   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2110     {
2111       Idx enabled_idx;
2112       re_token_type_t type;
2113       struct re_backref_cache_entry *entry;
2114       node = candidates->elems[node_idx];
2115       type = dfa->nodes[node].type;
2116       /* Avoid infinite loop for the REs like "()\1+".  */
2117       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2118         continue;
2119       if (type != OP_BACK_REF)
2120         continue;
2121
2122       entry = mctx->bkref_ents + first_idx;
2123       enabled_idx = first_idx;
2124       do
2125         {
2126           Idx subexp_len, to_idx, dst_node, ret;
2127           re_dfastate_t *cur_state;
2128
2129           if (entry->node != node)
2130             continue;
2131           subexp_len = entry->subexp_to - entry->subexp_from;
2132           to_idx = str_idx + subexp_len;
2133           dst_node = (subexp_len ? dfa->nexts[node]
2134                       : dfa->edests[node].elems[0]);
2135
2136           if (to_idx > sctx->last_str_idx
2137               || sctx->sifted_states[to_idx] == NULL
2138               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2139               || check_dst_limits (mctx, &sctx->limits, node,
2140                                    str_idx, dst_node, to_idx))
2141             continue;
2142
2143           if (local_sctx.sifted_states == NULL)
2144             {
2145               local_sctx = *sctx;
2146               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2147               if (BE (err != REG_NOERROR, 0))
2148                 goto free_return;
2149             }
2150           local_sctx.last_node = node;
2151           local_sctx.last_str_idx = str_idx;
2152           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2153           if (BE (ret < 0, 0))
2154             {
2155               err = REG_ESPACE;
2156               goto free_return;
2157             }
2158           cur_state = local_sctx.sifted_states[str_idx];
2159           err = sift_states_backward (mctx, &local_sctx);
2160           if (BE (err != REG_NOERROR, 0))
2161             goto free_return;
2162           if (sctx->limited_states != NULL)
2163             {
2164               err = merge_state_array (dfa, sctx->limited_states,
2165                                        local_sctx.sifted_states,
2166                                        str_idx + 1);
2167               if (BE (err != REG_NOERROR, 0))
2168                 goto free_return;
2169             }
2170           local_sctx.sifted_states[str_idx] = cur_state;
2171           re_node_set_remove (&local_sctx.limits, enabled_idx);
2172
2173           /* mctx->bkref_ents may have changed, reload the pointer.  */
2174           entry = mctx->bkref_ents + enabled_idx;
2175         }
2176       while (enabled_idx++, entry++->more);
2177     }
2178   err = REG_NOERROR;
2179  free_return:
2180   if (local_sctx.sifted_states != NULL)
2181     {
2182       re_node_set_free (&local_sctx.limits);
2183     }
2184
2185   return err;
2186 }
2187
2188
2189 #ifdef RE_ENABLE_I18N
2190 static int
2191 internal_function
2192 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2193                      Idx node_idx, Idx str_idx, Idx max_str_idx)
2194 {
2195   re_dfa_t *const dfa = mctx->dfa;
2196   int naccepted;
2197   /* Check the node can accept `multi byte'.  */
2198   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2199   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2200       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2201                             dfa->nexts[node_idx]))
2202     /* The node can't accept the `multi byte', or the
2203        destination was already thrown away, then the node
2204        could't accept the current input `multi byte'.   */
2205     naccepted = 0;
2206   /* Otherwise, it is sure that the node could accept
2207      `naccepted' bytes input.  */
2208   return naccepted;
2209 }
2210 #endif /* RE_ENABLE_I18N */
2211
2212 \f
2213 /* Functions for state transition.  */
2214
2215 /* Return the next state to which the current state STATE will transit by
2216    accepting the current input byte, and update STATE_LOG if necessary.
2217    If STATE can accept a multibyte char/collating element/back reference
2218    update the destination of STATE_LOG.  */
2219
2220 static re_dfastate_t *
2221 internal_function
2222 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2223                re_dfastate_t *state)
2224 {
2225   re_dfastate_t **trtable;
2226   unsigned char ch;
2227
2228 #ifdef RE_ENABLE_I18N
2229   /* If the current state can accept multibyte.  */
2230   if (BE (state->accept_mb, 0))
2231     {
2232       *err = transit_state_mb (mctx, state);
2233       if (BE (*err != REG_NOERROR, 0))
2234         return NULL;
2235     }
2236 #endif /* RE_ENABLE_I18N */
2237
2238   /* Then decide the next state with the single byte.  */
2239 #if 0
2240   if (0)
2241     /* don't use transition table  */
2242     return transit_state_sb (err, mctx, state);
2243 #endif
2244
2245   /* Use transition table  */
2246   ch = re_string_fetch_byte (&mctx->input);
2247   for (;;)
2248     {
2249       trtable = state->trtable;
2250       if (BE (trtable != NULL, 1))
2251         return trtable[ch];
2252
2253       trtable = state->word_trtable;
2254       if (BE (trtable != NULL, 1))
2255         {
2256           unsigned int context;
2257           context
2258             = re_string_context_at (&mctx->input,
2259                                     re_string_cur_idx (&mctx->input) - 1,
2260                                     mctx->eflags);
2261           if (IS_WORD_CONTEXT (context))
2262             return trtable[ch + SBC_MAX];
2263           else
2264             return trtable[ch];
2265         }
2266
2267       if (!build_trtable (mctx->dfa, state))
2268         {
2269           *err = REG_ESPACE;
2270           return NULL;
2271         }
2272
2273       /* Retry, we now have a transition table.  */
2274     }
2275 }
2276
2277 /* Update the state_log if we need */
2278 re_dfastate_t *
2279 internal_function
2280 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2281                       re_dfastate_t *next_state)
2282 {
2283   re_dfa_t *const dfa = mctx->dfa;
2284   Idx cur_idx = re_string_cur_idx (&mctx->input);
2285
2286   if (cur_idx > mctx->state_log_top)
2287     {
2288       mctx->state_log[cur_idx] = next_state;
2289       mctx->state_log_top = cur_idx;
2290     }
2291   else if (mctx->state_log[cur_idx] == 0)
2292     {
2293       mctx->state_log[cur_idx] = next_state;
2294     }
2295   else
2296     {
2297       re_dfastate_t *pstate;
2298       unsigned int context;
2299       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2300       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2301          the destination of a multibyte char/collating element/
2302          back reference.  Then the next state is the union set of
2303          these destinations and the results of the transition table.  */
2304       pstate = mctx->state_log[cur_idx];
2305       log_nodes = pstate->entrance_nodes;
2306       if (next_state != NULL)
2307         {
2308           table_nodes = next_state->entrance_nodes;
2309           *err = re_node_set_init_union (&next_nodes, table_nodes,
2310                                              log_nodes);
2311           if (BE (*err != REG_NOERROR, 0))
2312             return NULL;
2313         }
2314       else
2315         next_nodes = *log_nodes;
2316       /* Note: We already add the nodes of the initial state,
2317          then we don't need to add them here.  */
2318
2319       context = re_string_context_at (&mctx->input,
2320                                       re_string_cur_idx (&mctx->input) - 1,
2321                                       mctx->eflags);
2322       next_state = mctx->state_log[cur_idx]
2323         = re_acquire_state_context (err, dfa, &next_nodes, context);
2324       /* We don't need to check errors here, since the return value of
2325          this function is next_state and ERR is already set.  */
2326
2327       if (table_nodes != NULL)
2328         re_node_set_free (&next_nodes);
2329     }
2330
2331   if (BE (dfa->nbackref, 0) && next_state != NULL)
2332     {
2333       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2334          later.  We must check them here, since the back references in the
2335          next state might use them.  */
2336       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2337                                         cur_idx);
2338       if (BE (*err != REG_NOERROR, 0))
2339         return NULL;
2340
2341       /* If the next state has back references.  */
2342       if (next_state->has_backref)
2343         {
2344           *err = transit_state_bkref (mctx, &next_state->nodes);
2345           if (BE (*err != REG_NOERROR, 0))
2346             return NULL;
2347           next_state = mctx->state_log[cur_idx];
2348         }
2349     }
2350
2351   return next_state;
2352 }
2353
2354 /* Skip bytes in the input that correspond to part of a
2355    multi-byte match, then look in the log for a state
2356    from which to restart matching.  */
2357 static re_dfastate_t *
2358 internal_function
2359 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2360 {
2361   re_dfastate_t *cur_state = NULL;
2362   do
2363     {
2364       Idx max = mctx->state_log_top;
2365       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2366
2367       do
2368         {
2369           if (++cur_str_idx > max)
2370             return NULL;
2371           re_string_skip_bytes (&mctx->input, 1);
2372         }
2373       while (mctx->state_log[cur_str_idx] == NULL);
2374
2375       cur_state = merge_state_with_log (err, mctx, NULL);
2376     }
2377   while (*err == REG_NOERROR && cur_state == NULL);
2378   return cur_state;
2379 }
2380
2381 /* Helper functions for transit_state.  */
2382
2383 /* From the node set CUR_NODES, pick up the nodes whose types are
2384    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2385    expression. And register them to use them later for evaluating the
2386    correspoding back references.  */
2387
2388 static reg_errcode_t
2389 internal_function
2390 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2391                            Idx str_idx)
2392 {
2393   re_dfa_t *const dfa = mctx->dfa;
2394   Idx node_idx;
2395   reg_errcode_t err;
2396
2397   /* TODO: This isn't efficient.
2398            Because there might be more than one nodes whose types are
2399            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2400            nodes.
2401            E.g. RE: (a){2}  */
2402   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2403     {
2404       Idx node = cur_nodes->elems[node_idx];
2405       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2406           && dfa->nodes[node].opr.idx < CHAR_BIT * sizeof dfa->used_bkref_map
2407           && dfa->used_bkref_map & (1u << dfa->nodes[node].opr.idx))
2408         {
2409           err = match_ctx_add_subtop (mctx, node, str_idx);
2410           if (BE (err != REG_NOERROR, 0))
2411             return err;
2412         }
2413     }
2414   return REG_NOERROR;
2415 }
2416
2417 #if 0
2418 /* Return the next state to which the current state STATE will transit by
2419    accepting the current input byte.  */
2420
2421 static re_dfastate_t *
2422 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2423                   re_dfastate_t *state)
2424 {
2425   re_dfa_t *const dfa = mctx->dfa;
2426   re_node_set next_nodes;
2427   re_dfastate_t *next_state;
2428   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2429   unsigned int context;
2430
2431   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2432   if (BE (*err != REG_NOERROR, 0))
2433     return NULL;
2434   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2435     {
2436       Idx cur_node = state->nodes.elems[node_cnt];
2437       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2438         {
2439           *err = re_node_set_merge (&next_nodes,
2440                                     dfa->eclosures + dfa->nexts[cur_node]);
2441           if (BE (*err != REG_NOERROR, 0))
2442             {
2443               re_node_set_free (&next_nodes);
2444               return NULL;
2445             }
2446         }
2447     }
2448   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2449   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2450   /* We don't need to check errors here, since the return value of
2451      this function is next_state and ERR is already set.  */
2452
2453   re_node_set_free (&next_nodes);
2454   re_string_skip_bytes (&mctx->input, 1);
2455   return next_state;
2456 }
2457 #endif
2458
2459 #ifdef RE_ENABLE_I18N
2460 static reg_errcode_t
2461 internal_function
2462 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2463 {
2464   re_dfa_t *const dfa = mctx->dfa;
2465   reg_errcode_t err;
2466   Idx i;
2467
2468   for (i = 0; i < pstate->nodes.nelem; ++i)
2469     {
2470       re_node_set dest_nodes, *new_nodes;
2471       Idx cur_node_idx = pstate->nodes.elems[i];
2472       int naccepted;
2473       Idx dest_idx;
2474       unsigned int context;
2475       re_dfastate_t *dest_state;
2476
2477       if (!dfa->nodes[cur_node_idx].accept_mb)
2478         continue;
2479
2480       if (dfa->nodes[cur_node_idx].constraint)
2481         {
2482           context = re_string_context_at (&mctx->input,
2483                                           re_string_cur_idx (&mctx->input),
2484                                           mctx->eflags);
2485           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2486                                            context))
2487             continue;
2488         }
2489
2490       /* How many bytes the node can accept?  */
2491       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2492                                            re_string_cur_idx (&mctx->input));
2493       if (naccepted == 0)
2494         continue;
2495
2496       /* The node can accepts `naccepted' bytes.  */
2497       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2498       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2499                                : mctx->max_mb_elem_len);
2500       err = clean_state_log_if_needed (mctx, dest_idx);
2501       if (BE (err != REG_NOERROR, 0))
2502         return err;
2503 #ifdef DEBUG
2504       assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2505 #endif
2506       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2507
2508       dest_state = mctx->state_log[dest_idx];
2509       if (dest_state == NULL)
2510         dest_nodes = *new_nodes;
2511       else
2512         {
2513           err = re_node_set_init_union (&dest_nodes,
2514                                         dest_state->entrance_nodes, new_nodes);
2515           if (BE (err != REG_NOERROR, 0))
2516             return err;
2517         }
2518       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2519       mctx->state_log[dest_idx]
2520         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2521       if (dest_state != NULL)
2522         re_node_set_free (&dest_nodes);
2523       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2524         return err;
2525     }
2526   return REG_NOERROR;
2527 }
2528 #endif /* RE_ENABLE_I18N */
2529
2530 static reg_errcode_t
2531 internal_function
2532 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2533 {
2534   re_dfa_t *const dfa = mctx->dfa;
2535   reg_errcode_t err;
2536   Idx i;
2537   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2538
2539   for (i = 0; i < nodes->nelem; ++i)
2540     {
2541       Idx dest_str_idx, prev_nelem, bkc_idx;
2542       Idx node_idx = nodes->elems[i];
2543       unsigned int context;
2544       const re_token_t *node = dfa->nodes + node_idx;
2545       re_node_set *new_dest_nodes;
2546
2547       /* Check whether `node' is a backreference or not.  */
2548       if (node->type != OP_BACK_REF)
2549         continue;
2550
2551       if (node->constraint)
2552         {
2553           context = re_string_context_at (&mctx->input, cur_str_idx,
2554                                           mctx->eflags);
2555           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2556             continue;
2557         }
2558
2559       /* `node' is a backreference.
2560          Check the substring which the substring matched.  */
2561       bkc_idx = mctx->nbkref_ents;
2562       err = get_subexp (mctx, node_idx, cur_str_idx);
2563       if (BE (err != REG_NOERROR, 0))
2564         goto free_return;
2565
2566       /* And add the epsilon closures (which is `new_dest_nodes') of
2567          the backreference to appropriate state_log.  */
2568 #ifdef DEBUG
2569       assert (dfa->nexts[node_idx] != REG_MISSING);
2570 #endif
2571       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2572         {
2573           Idx subexp_len;
2574           re_dfastate_t *dest_state;
2575           struct re_backref_cache_entry *bkref_ent;
2576           bkref_ent = mctx->bkref_ents + bkc_idx;
2577           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2578             continue;
2579           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2580           new_dest_nodes = (subexp_len == 0
2581                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2582                             : dfa->eclosures + dfa->nexts[node_idx]);
2583           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2584                           - bkref_ent->subexp_from);
2585           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2586                                           mctx->eflags);
2587           dest_state = mctx->state_log[dest_str_idx];
2588           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2589                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2590           /* Add `new_dest_node' to state_log.  */
2591           if (dest_state == NULL)
2592             {
2593               mctx->state_log[dest_str_idx]
2594                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2595                                             context);
2596               if (BE (mctx->state_log[dest_str_idx] == NULL
2597                       && err != REG_NOERROR, 0))
2598                 goto free_return;
2599             }
2600           else
2601             {
2602               re_node_set dest_nodes;
2603               err = re_node_set_init_union (&dest_nodes,
2604                                             dest_state->entrance_nodes,
2605                                             new_dest_nodes);
2606               if (BE (err != REG_NOERROR, 0))
2607                 {
2608                   re_node_set_free (&dest_nodes);
2609                   goto free_return;
2610                 }
2611               mctx->state_log[dest_str_idx]
2612                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2613               re_node_set_free (&dest_nodes);
2614               if (BE (mctx->state_log[dest_str_idx] == NULL
2615                       && err != REG_NOERROR, 0))
2616                 goto free_return;
2617             }
2618           /* We need to check recursively if the backreference can epsilon
2619              transit.  */
2620           if (subexp_len == 0
2621               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2622             {
2623               err = check_subexp_matching_top (mctx, new_dest_nodes,
2624                                                cur_str_idx);
2625               if (BE (err != REG_NOERROR, 0))
2626                 goto free_return;
2627               err = transit_state_bkref (mctx, new_dest_nodes);
2628               if (BE (err != REG_NOERROR, 0))
2629                 goto free_return;
2630             }
2631         }
2632     }
2633   err = REG_NOERROR;
2634  free_return:
2635   return err;
2636 }
2637
2638 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2639    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2640    Note that we might collect inappropriate candidates here.
2641    However, the cost of checking them strictly here is too high, then we
2642    delay these checking for prune_impossible_nodes().  */
2643
2644 static reg_errcode_t
2645 internal_function
2646 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2647 {
2648   re_dfa_t *const dfa = mctx->dfa;
2649   Idx subexp_num, sub_top_idx;
2650   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2651   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2652   Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2653   if (cache_idx != REG_MISSING)
2654     {
2655       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2656       do
2657         if (entry->node == bkref_node)
2658           return REG_NOERROR; /* We already checked it.  */
2659       while (entry++->more);
2660     }
2661
2662   subexp_num = dfa->nodes[bkref_node].opr.idx;
2663
2664   /* For each sub expression  */
2665   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2666     {
2667       reg_errcode_t err;
2668       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2669       re_sub_match_last_t *sub_last;
2670       Idx sub_last_idx, sl_str, bkref_str_off;
2671
2672       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2673         continue; /* It isn't related.  */
2674
2675       sl_str = sub_top->str_idx;
2676       bkref_str_off = bkref_str_idx;
2677       /* At first, check the last node of sub expressions we already
2678          evaluated.  */
2679       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2680         {
2681           regoff_t sl_str_diff;
2682           sub_last = sub_top->lasts[sub_last_idx];
2683           sl_str_diff = sub_last->str_idx - sl_str;
2684           /* The matched string by the sub expression match with the substring
2685              at the back reference?  */
2686           if (sl_str_diff > 0)
2687             {
2688               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2689                 {
2690                   /* Not enough chars for a successful match.  */
2691                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2692                     break;
2693
2694                   err = clean_state_log_if_needed (mctx,
2695                                                    bkref_str_off
2696                                                    + sl_str_diff);
2697                   if (BE (err != REG_NOERROR, 0))
2698                     return err;
2699                   buf = (const char *) re_string_get_buffer (&mctx->input);
2700                 }
2701               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2702                 break; /* We don't need to search this sub expression any more.  */
2703             }
2704           bkref_str_off += sl_str_diff;
2705           sl_str += sl_str_diff;
2706           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2707                                 bkref_str_idx);
2708
2709           /* Reload buf, since the preceding call might have reallocated
2710              the buffer.  */
2711           buf = (const char *) re_string_get_buffer (&mctx->input);
2712
2713           if (err == REG_NOMATCH)
2714             continue;
2715           if (BE (err != REG_NOERROR, 0))
2716             return err;
2717         }
2718
2719       if (sub_last_idx < sub_top->nlasts)
2720         continue;
2721       if (sub_last_idx > 0)
2722         ++sl_str;
2723       /* Then, search for the other last nodes of the sub expression.  */
2724       for (; sl_str <= bkref_str_idx; ++sl_str)
2725         {
2726           Idx cls_node;
2727           regoff_t sl_str_off;
2728           const re_node_set *nodes;
2729           sl_str_off = sl_str - sub_top->str_idx;
2730           /* The matched string by the sub expression match with the substring
2731              at the back reference?  */
2732           if (sl_str_off > 0)
2733             {
2734               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2735                 {
2736                   /* If we are at the end of the input, we cannot match.  */
2737                   if (bkref_str_off >= mctx->input.len)
2738                     break;
2739
2740                   err = extend_buffers (mctx);
2741                   if (BE (err != REG_NOERROR, 0))
2742                     return err;
2743
2744                   buf = (const char *) re_string_get_buffer (&mctx->input);
2745                 }
2746               if (buf [bkref_str_off++] != buf[sl_str - 1])
2747                 break; /* We don't need to search this sub expression
2748                           any more.  */
2749             }
2750           if (mctx->state_log[sl_str] == NULL)
2751             continue;
2752           /* Does this state have a ')' of the sub expression?  */
2753           nodes = &mctx->state_log[sl_str]->nodes;
2754           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2755           if (cls_node == REG_MISSING)
2756             continue; /* No.  */
2757           if (sub_top->path == NULL)
2758             {
2759               sub_top->path = re_calloc (state_array_t,
2760                                          sl_str - sub_top->str_idx + 1);
2761               if (sub_top->path == NULL)
2762                 return REG_ESPACE;
2763             }
2764           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2765              in the current context?  */
2766           err = check_arrival (mctx, sub_top->path, sub_top->node,
2767                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2768           if (err == REG_NOMATCH)
2769               continue;
2770           if (BE (err != REG_NOERROR, 0))
2771               return err;
2772           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2773           if (BE (sub_last == NULL, 0))
2774             return REG_ESPACE;
2775           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2776                                 bkref_str_idx);
2777           if (err == REG_NOMATCH)
2778             continue;
2779         }
2780     }
2781   return REG_NOERROR;
2782 }
2783
2784 /* Helper functions for get_subexp().  */
2785
2786 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2787    If it can arrive, register the sub expression expressed with SUB_TOP
2788    and SUB_LAST.  */
2789
2790 static reg_errcode_t
2791 internal_function
2792 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2793                 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2794 {
2795   reg_errcode_t err;
2796   Idx to_idx;
2797   /* Can the subexpression arrive the back reference?  */
2798   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2799                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2800   if (err != REG_NOERROR)
2801     return err;
2802   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2803                              sub_last->str_idx);
2804   if (BE (err != REG_NOERROR, 0))
2805     return err;
2806   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2807   return clean_state_log_if_needed (mctx, to_idx);
2808 }
2809
2810 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2811    Search '(' if FL_OPEN, or search ')' otherwise.
2812    TODO: This function isn't efficient...
2813          Because there might be more than one nodes whose types are
2814          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2815          nodes.
2816          E.g. RE: (a){2}  */
2817
2818 static Idx
2819 internal_function
2820 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2821                   Idx subexp_idx, int type)
2822 {
2823   Idx cls_idx;
2824   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2825     {
2826       Idx cls_node = nodes->elems[cls_idx];
2827       const re_token_t *node = dfa->nodes + cls_node;
2828       if (node->type == type
2829           && node->opr.idx == subexp_idx)
2830         return cls_node;
2831     }
2832   return REG_MISSING;
2833 }
2834
2835 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2836    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2837    heavily reused.
2838    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2839
2840 static reg_errcode_t
2841 internal_function
2842 check_arrival (re_match_context_t *mctx, state_array_t *path,
2843                Idx top_node, Idx top_str, Idx last_node, Idx last_str,
2844                int type)
2845 {
2846   re_dfa_t *const dfa = mctx->dfa;
2847   reg_errcode_t err;
2848   Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2849   re_dfastate_t *cur_state = NULL;
2850   re_node_set *cur_nodes, next_nodes;
2851   re_dfastate_t **backup_state_log;
2852   unsigned int context;
2853
2854   subexp_num = dfa->nodes[top_node].opr.idx;
2855   /* Extend the buffer if we need.  */
2856   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2857     {
2858       re_dfastate_t **new_array;
2859       Idx old_alloc = path->alloc;
2860       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2861       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2862       if (new_array == NULL)
2863         {
2864           path->alloc = old_alloc;
2865           return REG_ESPACE;
2866         }
2867       path->array = new_array;
2868       memset (new_array + old_alloc, '\0',
2869               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2870     }
2871
2872   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2873
2874   /* Temporary modify MCTX.  */
2875   backup_state_log = mctx->state_log;
2876   backup_cur_idx = mctx->input.cur_idx;
2877   mctx->state_log = path->array;
2878   mctx->input.cur_idx = str_idx;
2879
2880   /* Setup initial node set.  */
2881   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2882   if (str_idx == top_str)
2883     {
2884       err = re_node_set_init_1 (&next_nodes, top_node);
2885       if (BE (err != REG_NOERROR, 0))
2886         return err;
2887       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2888       if (BE (err != REG_NOERROR, 0))
2889         {
2890           re_node_set_free (&next_nodes);
2891           return err;
2892         }
2893     }
2894   else
2895     {
2896       cur_state = mctx->state_log[str_idx];
2897       if (cur_state && cur_state->has_backref)
2898         {
2899           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2900           if (BE ( err != REG_NOERROR, 0))
2901             return err;
2902         }
2903       else
2904         re_node_set_init_empty (&next_nodes);
2905     }
2906   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2907     {
2908       if (next_nodes.nelem)
2909         {
2910           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2911                                     subexp_num, type);
2912           if (BE ( err != REG_NOERROR, 0))
2913             {
2914               re_node_set_free (&next_nodes);
2915               return err;
2916             }
2917         }
2918       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2919       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2920         {
2921           re_node_set_free (&next_nodes);
2922           return err;
2923         }
2924       mctx->state_log[str_idx] = cur_state;
2925     }
2926
2927   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2928     {
2929       re_node_set_empty (&next_nodes);
2930       if (mctx->state_log[str_idx + 1])
2931         {
2932           err = re_node_set_merge (&next_nodes,
2933                                    &mctx->state_log[str_idx + 1]->nodes);
2934           if (BE (err != REG_NOERROR, 0))
2935             {
2936               re_node_set_free (&next_nodes);
2937               return err;
2938             }
2939         }
2940       if (cur_state)
2941         {
2942           err = check_arrival_add_next_nodes (mctx, str_idx,
2943                                               &cur_state->non_eps_nodes, &next_nodes);
2944           if (BE (err != REG_NOERROR, 0))
2945             {
2946               re_node_set_free (&next_nodes);
2947               return err;
2948             }
2949         }
2950       ++str_idx;
2951       if (next_nodes.nelem)
2952         {
2953           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2954           if (BE (err != REG_NOERROR, 0))
2955             {
2956               re_node_set_free (&next_nodes);
2957               return err;
2958             }
2959           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2960                                     subexp_num, type);
2961           if (BE ( err != REG_NOERROR, 0))
2962             {
2963               re_node_set_free (&next_nodes);
2964               return err;
2965             }
2966         }
2967       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2968       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2969       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2970         {
2971           re_node_set_free (&next_nodes);
2972           return err;
2973         }
2974       mctx->state_log[str_idx] = cur_state;
2975       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2976     }
2977   re_node_set_free (&next_nodes);
2978   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2979                : &mctx->state_log[last_str]->nodes);
2980   path->next_idx = str_idx;
2981
2982   /* Fix MCTX.  */
2983   mctx->state_log = backup_state_log;
2984   mctx->input.cur_idx = backup_cur_idx;
2985
2986   /* Then check the current node set has the node LAST_NODE.  */
2987   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2988     return REG_NOERROR;
2989
2990   return REG_NOMATCH;
2991 }
2992
2993 /* Helper functions for check_arrival.  */
2994
2995 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2996    to NEXT_NODES.
2997    TODO: This function is similar to the functions transit_state*(),
2998          however this function has many additional works.
2999          Can't we unify them?  */
3000
3001 static reg_errcode_t
3002 internal_function
3003 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3004                               re_node_set *cur_nodes,
3005                               re_node_set *next_nodes)
3006 {
3007   re_dfa_t *const dfa = mctx->dfa;
3008   int result;
3009   Idx cur_idx;
3010   reg_errcode_t err;
3011   re_node_set union_set;
3012   re_node_set_init_empty (&union_set);
3013   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3014     {
3015       int naccepted = 0;
3016       Idx cur_node = cur_nodes->elems[cur_idx];
3017 #ifdef DEBUG
3018       re_token_type_t type = dfa->nodes[cur_node].type;
3019       assert (!IS_EPSILON_NODE (type));
3020 #endif
3021 #ifdef RE_ENABLE_I18N
3022       /* If the node may accept `multi byte'.  */
3023       if (dfa->nodes[cur_node].accept_mb)
3024         {
3025           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3026                                                str_idx);
3027           if (naccepted > 1)
3028             {
3029               re_dfastate_t *dest_state;
3030               Idx next_node = dfa->nexts[cur_node];
3031               Idx next_idx = str_idx + naccepted;
3032               dest_state = mctx->state_log[next_idx];
3033               re_node_set_empty (&union_set);
3034               if (dest_state)
3035                 {
3036                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3037                   if (BE (err != REG_NOERROR, 0))
3038                     {
3039                       re_node_set_free (&union_set);
3040                       return err;
3041                     }
3042                 }
3043               result = re_node_set_insert (&union_set, next_node);
3044               if (BE (result < 0, 0))
3045                 {
3046                   re_node_set_free (&union_set);
3047                   return REG_ESPACE;
3048                 }
3049               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3050                                                             &union_set);
3051               if (BE (mctx->state_log[next_idx] == NULL
3052                       && err != REG_NOERROR, 0))
3053                 {
3054                   re_node_set_free (&union_set);
3055                   return err;
3056                 }
3057             }
3058         }
3059 #endif /* RE_ENABLE_I18N */
3060       if (naccepted
3061           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3062         {
3063           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3064           if (BE (result < 0, 0))
3065             {
3066               re_node_set_free (&union_set);
3067               return REG_ESPACE;
3068             }
3069         }
3070     }
3071   re_node_set_free (&union_set);
3072   return REG_NOERROR;
3073 }
3074
3075 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3076    CUR_NODES, however exclude the nodes which are:
3077     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3078     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3079 */
3080
3081 static reg_errcode_t
3082 internal_function
3083 check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3084                           Idx ex_subexp, int type)
3085 {
3086   reg_errcode_t err;
3087   Idx idx, outside_node;
3088   re_node_set new_nodes;
3089 #ifdef DEBUG
3090   assert (cur_nodes->nelem);
3091 #endif
3092   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3093   if (BE (err != REG_NOERROR, 0))
3094     return err;
3095   /* Create a new node set NEW_NODES with the nodes which are epsilon
3096      closures of the node in CUR_NODES.  */
3097
3098   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3099     {
3100       Idx cur_node = cur_nodes->elems[idx];
3101       re_node_set *eclosure = dfa->eclosures + cur_node;
3102       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3103       if (outside_node == REG_MISSING)
3104         {
3105           /* There are no problematic nodes, just merge them.  */
3106           err = re_node_set_merge (&new_nodes, eclosure);
3107           if (BE (err != REG_NOERROR, 0))
3108             {
3109               re_node_set_free (&new_nodes);
3110               return err;
3111             }
3112         }
3113       else
3114         {
3115           /* There are problematic nodes, re-calculate incrementally.  */
3116           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3117                                               ex_subexp, type);
3118           if (BE (err != REG_NOERROR, 0))
3119             {
3120               re_node_set_free (&new_nodes);
3121               return err;
3122             }
3123         }
3124     }
3125   re_node_set_free (cur_nodes);
3126   *cur_nodes = new_nodes;
3127   return REG_NOERROR;
3128 }
3129
3130 /* Helper function for check_arrival_expand_ecl.
3131    Check incrementally the epsilon closure of TARGET, and if it isn't
3132    problematic append it to DST_NODES.  */
3133
3134 static reg_errcode_t
3135 internal_function
3136 check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3137                               Idx target, Idx ex_subexp, int type)
3138 {
3139   Idx cur_node;
3140   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3141     {
3142       int err;
3143
3144       if (dfa->nodes[cur_node].type == type
3145           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3146         {
3147           if (type == OP_CLOSE_SUBEXP)
3148             {
3149               err = re_node_set_insert (dst_nodes, cur_node);
3150               if (BE (err == -1, 0))
3151                 return REG_ESPACE;
3152             }
3153           break;
3154         }
3155       err = re_node_set_insert (dst_nodes, cur_node);
3156       if (BE (err == -1, 0))
3157         return REG_ESPACE;
3158       if (dfa->edests[cur_node].nelem == 0)
3159         break;
3160       if (dfa->edests[cur_node].nelem == 2)
3161         {
3162           reg_errcode_t ret =
3163             check_arrival_expand_ecl_sub (dfa, dst_nodes,
3164                                           dfa->edests[cur_node].elems[1],
3165                                           ex_subexp, type);
3166           if (BE (ret != REG_NOERROR, 0))
3167             return ret;
3168         }
3169       cur_node = dfa->edests[cur_node].elems[0];
3170     }
3171   return REG_NOERROR;
3172 }
3173
3174
3175 /* For all the back references in the current state, calculate the
3176    destination of the back references by the appropriate entry
3177    in MCTX->BKREF_ENTS.  */
3178
3179 static reg_errcode_t
3180 internal_function
3181 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3182                     Idx cur_str, Idx subexp_num, int type)
3183 {
3184   re_dfa_t *const dfa = mctx->dfa;
3185   reg_errcode_t err;
3186   Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3187   struct re_backref_cache_entry *ent;
3188
3189   if (cache_idx_start == REG_MISSING)
3190     return REG_NOERROR;
3191
3192  restart:
3193   ent = mctx->bkref_ents + cache_idx_start;
3194   do
3195     {
3196       Idx to_idx, next_node;
3197
3198       /* Is this entry ENT is appropriate?  */
3199       if (!re_node_set_contains (cur_nodes, ent->node))
3200         continue; /* No.  */
3201
3202       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3203       /* Calculate the destination of the back reference, and append it
3204          to MCTX->STATE_LOG.  */
3205       if (to_idx == cur_str)
3206         {
3207           /* The backreference did epsilon transit, we must re-check all the
3208              node in the current state.  */
3209           re_node_set new_dests;
3210           reg_errcode_t err2, err3;
3211           next_node = dfa->edests[ent->node].elems[0];
3212           if (re_node_set_contains (cur_nodes, next_node))
3213             continue;
3214           err = re_node_set_init_1 (&new_dests, next_node);
3215           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3216           err3 = re_node_set_merge (cur_nodes, &new_dests);
3217           re_node_set_free (&new_dests);
3218           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3219                   || err3 != REG_NOERROR, 0))
3220             {
3221               err = (err != REG_NOERROR ? err
3222                      : (err2 != REG_NOERROR ? err2 : err3));
3223               return err;
3224             }
3225           /* TODO: It is still inefficient...  */
3226           goto restart;
3227         }
3228       else
3229         {
3230           re_node_set union_set;
3231           next_node = dfa->nexts[ent->node];
3232           if (mctx->state_log[to_idx])
3233             {
3234               int ret;
3235               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3236                                         next_node))
3237                 continue;
3238               err = re_node_set_init_copy (&union_set,
3239                                            &mctx->state_log[to_idx]->nodes);
3240               ret = re_node_set_insert (&union_set, next_node);
3241               if (BE (err != REG_NOERROR || ret < 0, 0))
3242                 {
3243                   re_node_set_free (&union_set);
3244                   err = err != REG_NOERROR ? err : REG_ESPACE;
3245                   return err;
3246                 }
3247             }
3248           else
3249             {
3250               err = re_node_set_init_1 (&union_set, next_node);
3251               if (BE (err != REG_NOERROR, 0))
3252                 return err;
3253             }
3254           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3255           re_node_set_free (&union_set);
3256           if (BE (mctx->state_log[to_idx] == NULL
3257                   && err != REG_NOERROR, 0))
3258             return err;
3259         }
3260     }
3261   while (ent++->more);
3262   return REG_NOERROR;
3263 }
3264
3265 /* Build transition table for the state.
3266    Return 1 if succeeded, otherwise return NULL.  */
3267
3268 static int
3269 internal_function
3270 build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3271 {
3272   reg_errcode_t err;
3273   Idx i, j;
3274   int ch, need_word_trtable = 0;
3275   unsigned int elem, mask;
3276   int dests_node_malloced = 0, dest_states_malloced = 0;
3277   Idx ndests; /* Number of the destination states from `state'.  */
3278   re_dfastate_t **trtable;
3279   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3280   re_node_set follows, *dests_node;
3281   bitset *dests_ch;
3282   bitset acceptable;
3283
3284   /* We build DFA states which corresponds to the destination nodes
3285      from `state'.  `dests_node[i]' represents the nodes which i-th
3286      destination state contains, and `dests_ch[i]' represents the
3287      characters which i-th destination state accepts.  */
3288   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3289     dests_node = (re_node_set *)
3290       alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3291   else
3292     {
3293       dests_node = (re_node_set *)
3294         malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3295       if (BE (dests_node == NULL, 0))
3296         return 0;
3297       dests_node_malloced = 1;
3298     }
3299   dests_ch = (bitset *) (dests_node + SBC_MAX);
3300
3301   /* Initialize transiton table.  */
3302   state->word_trtable = state->trtable = NULL;
3303
3304   /* At first, group all nodes belonging to `state' into several
3305      destinations.  */
3306   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3307   if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3308     {
3309       if (dests_node_malloced)
3310         free (dests_node);
3311       /* Return 0 in case of an error, 1 otherwise.  */
3312       if (ndests == 0)
3313         {
3314           state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3315           return 1;
3316         }
3317       return 0;
3318     }
3319
3320   err = re_node_set_alloc (&follows, ndests + 1);
3321   if (BE (err != REG_NOERROR, 0))
3322     goto out_free;
3323
3324   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3325                          + ndests * 3 * sizeof (re_dfastate_t *)))
3326     dest_states = (re_dfastate_t **)
3327       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3328   else
3329     {
3330       dest_states = (re_dfastate_t **)
3331         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3332       if (BE (dest_states == NULL, 0))
3333         {
3334 out_free:
3335           if (dest_states_malloced)
3336             free (dest_states);
3337           re_node_set_free (&follows);
3338           for (i = 0; i < ndests; ++i)
3339             re_node_set_free (dests_node + i);
3340           if (dests_node_malloced)
3341             free (dests_node);
3342           return 0;
3343         }
3344       dest_states_malloced = 1;
3345     }
3346   dest_states_word = dest_states + ndests;
3347   dest_states_nl = dest_states_word + ndests;
3348   bitset_empty (acceptable);
3349
3350   /* Then build the states for all destinations.  */
3351   for (i = 0; i < ndests; ++i)
3352     {
3353       Idx next_node;
3354       re_node_set_empty (&follows);
3355       /* Merge the follows of this destination states.  */
3356       for (j = 0; j < dests_node[i].nelem; ++j)
3357         {
3358           next_node = dfa->nexts[dests_node[i].elems[j]];
3359           if (next_node != REG_MISSING)
3360             {
3361               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3362               if (BE (err != REG_NOERROR, 0))
3363                 goto out_free;
3364             }
3365         }
3366       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3367       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3368         goto out_free;
3369       /* If the new state has context constraint,
3370          build appropriate states for these contexts.  */
3371       if (dest_states[i]->has_constraint)
3372         {
3373           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3374                                                           CONTEXT_WORD);
3375           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3376             goto out_free;
3377
3378           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3379             need_word_trtable = 1;
3380
3381           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3382                                                         CONTEXT_NEWLINE);
3383           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3384             goto out_free;
3385         }
3386       else
3387         {
3388           dest_states_word[i] = dest_states[i];
3389           dest_states_nl[i] = dest_states[i];
3390         }
3391       bitset_merge (acceptable, dests_ch[i]);
3392     }
3393
3394   if (!BE (need_word_trtable, 0))
3395     {
3396       /* We don't care about whether the following character is a word
3397          character, or we are in a single-byte character set so we can
3398          discern by looking at the character code: allocate a
3399          256-entry transition table.  */
3400       trtable = state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3401       if (BE (trtable == NULL, 0))
3402         goto out_free;
3403
3404       /* For all characters ch...:  */
3405       for (i = 0; i < BITSET_UINTS; ++i)
3406         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3407              elem;
3408              mask <<= 1, elem >>= 1, ++ch)
3409           if (BE (elem & 1, 0))
3410             {
3411               /* There must be exactly one destination which accepts
3412                  character ch.  See group_nodes_into_DFAstates.  */
3413               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3414                 ;
3415
3416               /* j-th destination accepts the word character ch.  */
3417               if (dfa->word_char[i] & mask)
3418                 trtable[ch] = dest_states_word[j];
3419               else
3420                 trtable[ch] = dest_states[j];
3421             }
3422     }
3423   else
3424     {
3425       /* We care about whether the following character is a word
3426          character, and we are in a multi-byte character set: discern
3427          by looking at the character code: build two 256-entry
3428          transition tables, one starting at trtable[0] and one
3429          starting at trtable[SBC_MAX].  */
3430       trtable = state->word_trtable = re_calloc (re_dfastate_t *, 2 * SBC_MAX);
3431       if (BE (trtable == NULL, 0))
3432         goto out_free;
3433
3434       /* For all characters ch...:  */
3435       for (i = 0; i < BITSET_UINTS; ++i)
3436         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3437              elem;
3438              mask <<= 1, elem >>= 1, ++ch)
3439           if (BE (elem & 1, 0))
3440             {
3441               /* There must be exactly one destination which accepts
3442                  character ch.  See group_nodes_into_DFAstates.  */
3443               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3444                 ;
3445
3446               /* j-th destination accepts the word character ch.  */
3447               trtable[ch] = dest_states[j];
3448               trtable[ch + SBC_MAX] = dest_states_word[j];
3449             }
3450     }
3451
3452   /* new line */
3453   if (bitset_contain (acceptable, NEWLINE_CHAR))
3454     {
3455       /* The current state accepts newline character.  */
3456       for (j = 0; j < ndests; ++j)
3457         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3458           {
3459             /* k-th destination accepts newline character.  */
3460             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3461             if (need_word_trtable)
3462               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3463             /* There must be only one destination which accepts
3464                newline.  See group_nodes_into_DFAstates.  */
3465             break;
3466           }
3467     }
3468
3469   if (dest_states_malloced)
3470     free (dest_states);
3471
3472   re_node_set_free (&follows);
3473   for (i = 0; i < ndests; ++i)
3474     re_node_set_free (dests_node + i);
3475
3476   if (dests_node_malloced)
3477     free (dests_node);
3478
3479   return 1;
3480 }
3481
3482 /* Group all nodes belonging to STATE into several destinations.
3483    Then for all destinations, set the nodes belonging to the destination
3484    to DESTS_NODE[i] and set the characters accepted by the destination
3485    to DEST_CH[i].  This function return the number of destinations.  */
3486
3487 static Idx
3488 internal_function
3489 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3490                             re_node_set *dests_node, bitset *dests_ch)
3491 {
3492   reg_errcode_t err;
3493   int result;
3494   Idx i, j, k;
3495   Idx ndests; /* Number of the destinations from `state'.  */
3496   bitset accepts; /* Characters a node can accept.  */
3497   const re_node_set *cur_nodes = &state->nodes;
3498   bitset_empty (accepts);
3499   ndests = 0;
3500
3501   /* For all the nodes belonging to `state',  */
3502   for (i = 0; i < cur_nodes->nelem; ++i)
3503     {
3504       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3505       re_token_type_t type = node->type;
3506       unsigned int constraint = node->constraint;
3507
3508       /* Enumerate all single byte character this node can accept.  */
3509       if (type == CHARACTER)
3510         bitset_set (accepts, node->opr.c);
3511       else if (type == SIMPLE_BRACKET)
3512         {
3513           bitset_merge (accepts, node->opr.sbcset);
3514         }
3515       else if (type == OP_PERIOD)
3516         {
3517 #ifdef RE_ENABLE_I18N
3518           if (dfa->mb_cur_max > 1)
3519             bitset_merge (accepts, dfa->sb_char);
3520           else
3521 #endif
3522             bitset_set_all (accepts);
3523           if (!(dfa->syntax & REG_DOT_NEWLINE))
3524             bitset_clear (accepts, '\n');
3525           if (dfa->syntax & REG_DOT_NOT_NULL)
3526             bitset_clear (accepts, '\0');
3527         }
3528 #ifdef RE_ENABLE_I18N
3529       else if (type == OP_UTF8_PERIOD)
3530         {
3531           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3532           if (!(dfa->syntax & REG_DOT_NEWLINE))
3533             bitset_clear (accepts, '\n');
3534           if (dfa->syntax & REG_DOT_NOT_NULL)
3535             bitset_clear (accepts, '\0');
3536         }
3537 #endif
3538       else
3539         continue;
3540
3541       /* Check the `accepts' and sift the characters which are not
3542          match it the context.  */
3543       if (constraint)
3544         {
3545           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3546             {
3547               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3548               bitset_empty (accepts);
3549               if (accepts_newline)
3550                 bitset_set (accepts, NEWLINE_CHAR);
3551               else
3552                 continue;
3553             }
3554           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3555             {
3556               bitset_empty (accepts);
3557               continue;
3558             }
3559
3560           if (constraint & NEXT_WORD_CONSTRAINT)
3561             {
3562               unsigned int any_set = 0;
3563               if (type == CHARACTER && !node->word_char)
3564                 {
3565                   bitset_empty (accepts);
3566                   continue;
3567                 }
3568 #ifdef RE_ENABLE_I18N
3569               if (dfa->mb_cur_max > 1)
3570                 for (j = 0; j < BITSET_UINTS; ++j)
3571                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3572               else
3573 #endif
3574                 for (j = 0; j < BITSET_UINTS; ++j)
3575                   any_set |= (accepts[j] &= dfa->word_char[j]);
3576               if (!any_set)
3577                 continue;
3578             }
3579           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3580             {
3581               unsigned int any_set = 0;
3582               if (type == CHARACTER && node->word_char)
3583                 {
3584                   bitset_empty (accepts);
3585                   continue;
3586                 }
3587 #ifdef RE_ENABLE_I18N
3588               if (dfa->mb_cur_max > 1)
3589                 for (j = 0; j < BITSET_UINTS; ++j)
3590                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3591               else
3592 #endif
3593                 for (j = 0; j < BITSET_UINTS; ++j)
3594                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3595               if (!any_set)
3596                 continue;
3597             }
3598         }
3599
3600       /* Then divide `accepts' into DFA states, or create a new
3601          state.  Above, we make sure that accepts is not empty.  */
3602       for (j = 0; j < ndests; ++j)
3603         {
3604           bitset intersec; /* Intersection sets, see below.  */
3605           bitset remains;
3606           /* Flags, see below.  */
3607           int has_intersec, not_subset, not_consumed;
3608
3609           /* Optimization, skip if this state doesn't accept the character.  */
3610           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3611             continue;
3612
3613           /* Enumerate the intersection set of this state and `accepts'.  */
3614           has_intersec = 0;
3615           for (k = 0; k < BITSET_UINTS; ++k)
3616             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3617           /* And skip if the intersection set is empty.  */
3618           if (!has_intersec)
3619             continue;
3620
3621           /* Then check if this state is a subset of `accepts'.  */
3622           not_subset = not_consumed = 0;
3623           for (k = 0; k < BITSET_UINTS; ++k)
3624             {
3625               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3626               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3627             }
3628
3629           /* If this state isn't a subset of `accepts', create a
3630              new group state, which has the `remains'. */
3631           if (not_subset)
3632             {
3633               bitset_copy (dests_ch[ndests], remains);
3634               bitset_copy (dests_ch[j], intersec);
3635               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3636               if (BE (err != REG_NOERROR, 0))
3637                 goto error_return;
3638               ++ndests;
3639             }
3640
3641           /* Put the position in the current group. */
3642           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3643           if (BE (result < 0, 0))
3644             goto error_return;
3645
3646           /* If all characters are consumed, go to next node. */
3647           if (!not_consumed)
3648             break;
3649         }
3650       /* Some characters remain, create a new group. */
3651       if (j == ndests)
3652         {
3653           bitset_copy (dests_ch[ndests], accepts);
3654           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3655           if (BE (err != REG_NOERROR, 0))
3656             goto error_return;
3657           ++ndests;
3658           bitset_empty (accepts);
3659         }
3660     }
3661   return ndests;
3662  error_return:
3663   for (j = 0; j < ndests; ++j)
3664     re_node_set_free (dests_node + j);
3665   return REG_MISSING;
3666 }
3667
3668 #ifdef RE_ENABLE_I18N
3669 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3670    Return the number of the bytes the node accepts.
3671    STR_IDX is the current index of the input string.
3672
3673    This function handles the nodes which can accept one character, or
3674    one collating element like '.', '[a-z]', opposite to the other nodes
3675    can only accept one byte.  */
3676
3677 static int
3678 internal_function
3679 check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
3680                          const re_string_t *input, Idx str_idx)
3681 {
3682   const re_token_t *node = dfa->nodes + node_idx;
3683   int char_len, elem_len;
3684   Idx i;
3685
3686   if (BE (node->type == OP_UTF8_PERIOD, 0))
3687     {
3688       unsigned char c = re_string_byte_at (input, str_idx), d;
3689       if (BE (c < 0xc2, 1))
3690         return 0;
3691
3692       if (str_idx + 2 > input->len)
3693         return 0;
3694
3695       d = re_string_byte_at (input, str_idx + 1);
3696       if (c < 0xe0)
3697         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3698       else if (c < 0xf0)
3699         {
3700           char_len = 3;
3701           if (c == 0xe0 && d < 0xa0)
3702             return 0;
3703         }
3704       else if (c < 0xf8)
3705         {
3706           char_len = 4;
3707           if (c == 0xf0 && d < 0x90)
3708             return 0;
3709         }
3710       else if (c < 0xfc)
3711         {
3712           char_len = 5;
3713           if (c == 0xf8 && d < 0x88)
3714             return 0;
3715         }
3716       else if (c < 0xfe)
3717         {
3718           char_len = 6;
3719           if (c == 0xfc && d < 0x84)
3720             return 0;
3721         }
3722       else
3723         return 0;
3724
3725       if (str_idx + char_len > input->len)
3726         return 0;
3727
3728       for (i = 1; i < char_len; ++i)
3729         {
3730           d = re_string_byte_at (input, str_idx + i);
3731           if (d < 0x80 || d > 0xbf)
3732             return 0;
3733         }
3734       return char_len;
3735     }
3736
3737   char_len = re_string_char_size_at (input, str_idx);
3738   if (node->type == OP_PERIOD)
3739     {
3740       if (char_len <= 1)
3741         return 0;
3742       /* FIXME: I don't think this if is needed, as both '\n'
3743          and '\0' are char_len == 1.  */
3744       /* '.' accepts any one character except the following two cases.  */
3745       if ((!(dfa->syntax & REG_DOT_NEWLINE) &&
3746            re_string_byte_at (input, str_idx) == '\n') ||
3747           ((dfa->syntax & REG_DOT_NOT_NULL) &&
3748            re_string_byte_at (input, str_idx) == '\0'))
3749         return 0;
3750       return char_len;
3751     }
3752
3753   elem_len = re_string_elem_size_at (input, str_idx);
3754   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3755     return 0;
3756
3757   if (node->type == COMPLEX_BRACKET)
3758     {
3759       const re_charset_t *cset = node->opr.mbcset;
3760 # ifdef _LIBC
3761       const unsigned char *pin
3762         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3763       Idx j;
3764       uint32_t nrules;
3765 # endif /* _LIBC */
3766       int match_len = 0;
3767       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3768                     ? re_string_wchar_at (input, str_idx) : 0);
3769
3770       /* match with multibyte character?  */
3771       for (i = 0; i < cset->nmbchars; ++i)
3772         if (wc == cset->mbchars[i])
3773           {
3774             match_len = char_len;
3775             goto check_node_accept_bytes_match;
3776           }
3777       /* match with character_class?  */
3778       for (i = 0; i < cset->nchar_classes; ++i)
3779         {
3780           wctype_t wt = cset->char_classes[i];
3781           if (__iswctype (wc, wt))
3782             {
3783               match_len = char_len;
3784               goto check_node_accept_bytes_match;
3785             }
3786         }
3787
3788 # ifdef _LIBC
3789       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3790       if (nrules != 0)
3791         {
3792           unsigned int in_collseq = 0;
3793           const int32_t *table, *indirect;
3794           const unsigned char *weights, *extra;
3795           const char *collseqwc;
3796           int32_t idx;
3797           /* This #include defines a local function!  */
3798 #  include <locale/weight.h>
3799
3800           /* match with collating_symbol?  */
3801           if (cset->ncoll_syms)
3802             extra = (const unsigned char *)
3803               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3804           for (i = 0; i < cset->ncoll_syms; ++i)
3805             {
3806               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3807               /* Compare the length of input collating element and
3808                  the length of current collating element.  */
3809               if (*coll_sym != elem_len)
3810                 continue;
3811               /* Compare each bytes.  */
3812               for (j = 0; j < *coll_sym; j++)
3813                 if (pin[j] != coll_sym[1 + j])
3814                   break;
3815               if (j == *coll_sym)
3816                 {
3817                   /* Match if every bytes is equal.  */
3818                   match_len = j;
3819                   goto check_node_accept_bytes_match;
3820                 }
3821             }
3822
3823           if (cset->nranges)
3824             {
3825               if (elem_len <= char_len)
3826                 {
3827                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3828                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3829                 }
3830               else
3831                 in_collseq = find_collation_sequence_value (pin, elem_len);
3832             }
3833           /* match with range expression?  */
3834           for (i = 0; i < cset->nranges; ++i)
3835             if (cset->range_starts[i] <= in_collseq
3836                 && in_collseq <= cset->range_ends[i])
3837               {
3838                 match_len = elem_len;
3839                 goto check_node_accept_bytes_match;
3840               }
3841
3842           /* match with equivalence_class?  */
3843           if (cset->nequiv_classes)
3844             {
3845               const unsigned char *cp = pin;
3846               table = (const int32_t *)
3847                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3848               weights = (const unsigned char *)
3849                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3850               extra = (const unsigned char *)
3851                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3852               indirect = (const int32_t *)
3853                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3854               idx = findidx (&cp);
3855               if (idx > 0)
3856                 for (i = 0; i < cset->nequiv_classes; ++i)
3857                   {
3858                     int32_t equiv_class_idx = cset->equiv_classes[i];
3859                     size_t weight_len = weights[idx];
3860                     if (weight_len == weights[equiv_class_idx])
3861                       {
3862                         Idx cnt = 0;
3863                         while (cnt <= weight_len
3864                                && (weights[equiv_class_idx + 1 + cnt]
3865                                    == weights[idx + 1 + cnt]))
3866                           ++cnt;
3867                         if (cnt > weight_len)
3868                           {
3869                             match_len = elem_len;
3870                             goto check_node_accept_bytes_match;
3871                           }
3872                       }
3873                   }
3874             }
3875         }
3876       else
3877 # endif /* _LIBC */
3878         {
3879           /* match with range expression?  */
3880 #if __GNUC__ >= 2
3881           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3882 #else
3883           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3884           cmp_buf[2] = wc;
3885 #endif
3886           for (i = 0; i < cset->nranges; ++i)
3887             {
3888               cmp_buf[0] = cset->range_starts[i];
3889               cmp_buf[4] = cset->range_ends[i];
3890               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3891                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3892                 {
3893                   match_len = char_len;
3894                   goto check_node_accept_bytes_match;
3895                 }
3896             }
3897         }
3898     check_node_accept_bytes_match:
3899       if (!cset->non_match)
3900         return match_len;
3901       else
3902         {
3903           if (match_len > 0)
3904             return 0;
3905           else
3906             return (elem_len > char_len) ? elem_len : char_len;
3907         }
3908     }
3909   return 0;
3910 }
3911
3912 # ifdef _LIBC
3913 static unsigned int
3914 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3915 {
3916   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3917   if (nrules == 0)
3918     {
3919       if (mbs_len == 1)
3920         {
3921           /* No valid character.  Match it as a single byte character.  */
3922           const unsigned char *collseq = (const unsigned char *)
3923             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3924           return collseq[mbs[0]];
3925         }
3926       return UINT_MAX;
3927     }
3928   else
3929     {
3930       int32_t idx;
3931       const unsigned char *extra = (const unsigned char *)
3932         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3933       int32_t extrasize = (const unsigned char *)
3934         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3935
3936       for (idx = 0; idx < extrasize;)
3937         {
3938           int mbs_cnt, found = 0;
3939           int32_t elem_mbs_len;
3940           /* Skip the name of collating element name.  */
3941           idx = idx + extra[idx] + 1;
3942           elem_mbs_len = extra[idx++];
3943           if (mbs_len == elem_mbs_len)
3944             {
3945               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3946                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3947                   break;
3948               if (mbs_cnt == elem_mbs_len)
3949                 /* Found the entry.  */
3950                 found = 1;
3951             }
3952           /* Skip the byte sequence of the collating element.  */
3953           idx += elem_mbs_len;
3954           /* Adjust for the alignment.  */
3955           idx = (idx + 3) & ~3;
3956           /* Skip the collation sequence value.  */
3957           idx += sizeof (uint32_t);
3958           /* Skip the wide char sequence of the collating element.  */
3959           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3960           /* If we found the entry, return the sequence value.  */
3961           if (found)
3962             return *(uint32_t *) (extra + idx);
3963           /* Skip the collation sequence value.  */
3964           idx += sizeof (uint32_t);
3965         }
3966       return UINT_MAX;
3967     }
3968 }
3969 # endif /* _LIBC */
3970 #endif /* RE_ENABLE_I18N */
3971
3972 /* Check whether the node accepts the byte which is IDX-th
3973    byte of the INPUT.  */
3974
3975 static int
3976 internal_function
3977 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3978                    Idx idx)
3979 {
3980   unsigned char ch;
3981   ch = re_string_byte_at (&mctx->input, idx);
3982   switch (node->type)
3983     {
3984     case CHARACTER:
3985       if (node->opr.c != ch)
3986         return 0;
3987       break;
3988
3989     case SIMPLE_BRACKET:
3990       if (!bitset_contain (node->opr.sbcset, ch))
3991         return 0;
3992       break;
3993
3994 #ifdef RE_ENABLE_I18N
3995     case OP_UTF8_PERIOD:
3996       if (ch >= 0x80)
3997         return 0;
3998       /* FALLTHROUGH */
3999 #endif
4000     case OP_PERIOD:
4001       if ((ch == '\n' && !(mctx->dfa->syntax & REG_DOT_NEWLINE))
4002           || (ch == '\0' && (mctx->dfa->syntax & REG_DOT_NOT_NULL)))
4003         return 0;
4004       break;
4005
4006     default:
4007       return 0;
4008     }
4009
4010   if (node->constraint)
4011     {
4012       /* The node has constraints.  Check whether the current context
4013          satisfies the constraints.  */
4014       unsigned int context = re_string_context_at (&mctx->input, idx,
4015                                                    mctx->eflags);
4016       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4017         return 0;
4018     }
4019
4020   return 1;
4021 }
4022
4023 /* Extend the buffers, if the buffers have run out.  */
4024
4025 static reg_errcode_t
4026 internal_function
4027 extend_buffers (re_match_context_t *mctx)
4028 {
4029   reg_errcode_t ret;
4030   re_string_t *pstr = &mctx->input;
4031
4032   /* Double the lengthes of the buffers.  */
4033   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4034   if (BE (ret != REG_NOERROR, 0))
4035     return ret;
4036
4037   if (mctx->state_log != NULL)
4038     {
4039       /* And double the length of state_log.  */
4040       /* XXX We have no indication of the size of this buffer.  If this
4041          allocation fail we have no indication that the state_log array
4042          does not have the right size.  */
4043       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4044                                               pstr->bufs_len + 1);
4045       if (BE (new_array == NULL, 0))
4046         return REG_ESPACE;
4047       mctx->state_log = new_array;
4048     }
4049
4050   /* Then reconstruct the buffers.  */
4051   if (pstr->icase)
4052     {
4053 #ifdef RE_ENABLE_I18N
4054       if (pstr->mb_cur_max > 1)
4055         {
4056           ret = build_wcs_upper_buffer (pstr);
4057           if (BE (ret != REG_NOERROR, 0))
4058             return ret;
4059         }
4060       else
4061 #endif /* RE_ENABLE_I18N  */
4062         build_upper_buffer (pstr);
4063     }
4064   else
4065     {
4066 #ifdef RE_ENABLE_I18N
4067       if (pstr->mb_cur_max > 1)
4068         build_wcs_buffer (pstr);
4069       else
4070 #endif /* RE_ENABLE_I18N  */
4071         {
4072           if (pstr->trans != NULL)
4073             re_string_translate_buffer (pstr);
4074         }
4075     }
4076   return REG_NOERROR;
4077 }
4078
4079 \f
4080 /* Functions for matching context.  */
4081
4082 /* Initialize MCTX.  */
4083
4084 static reg_errcode_t
4085 internal_function
4086 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4087 {
4088   mctx->eflags = eflags;
4089   mctx->match_last = REG_MISSING;
4090   if (n > 0)
4091     {
4092       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4093       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4094       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4095         return REG_ESPACE;
4096     }
4097   /* Already zero-ed by the caller.
4098      else
4099        mctx->bkref_ents = NULL;
4100      mctx->nbkref_ents = 0;
4101      mctx->nsub_tops = 0;  */
4102   mctx->abkref_ents = n;
4103   mctx->max_mb_elem_len = 1;
4104   mctx->asub_tops = n;
4105   return REG_NOERROR;
4106 }
4107
4108 /* Clean the entries which depend on the current input in MCTX.
4109    This function must be invoked when the matcher changes the start index
4110    of the input, or changes the input string.  */
4111
4112 static void
4113 internal_function
4114 match_ctx_clean (re_match_context_t *mctx)
4115 {
4116   Idx st_idx;
4117   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4118     {
4119       Idx sl_idx;
4120       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4121       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4122         {
4123           re_sub_match_last_t *last = top->lasts[sl_idx];
4124           re_free (last->path.array);
4125           re_free (last);
4126         }
4127       re_free (top->lasts);
4128       if (top->path)
4129         {
4130           re_free (top->path->array);
4131           re_free (top->path);
4132         }
4133       free (top);
4134     }
4135
4136   mctx->nsub_tops = 0;
4137   mctx->nbkref_ents = 0;
4138 }
4139
4140 /* Free all the memory associated with MCTX.  */
4141
4142 static void
4143 internal_function
4144 match_ctx_free (re_match_context_t *mctx)
4145 {
4146   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4147   match_ctx_clean (mctx);
4148   re_free (mctx->sub_tops);
4149   re_free (mctx->bkref_ents);
4150 }
4151
4152 /* Add a new backreference entry to MCTX.
4153    Note that we assume that caller never call this function with duplicate
4154    entry, and call with STR_IDX which isn't smaller than any existing entry.
4155 */
4156
4157 static reg_errcode_t
4158 internal_function
4159 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx,
4160                      Idx from, Idx to)
4161 {
4162   if (mctx->nbkref_ents >= mctx->abkref_ents)
4163     {
4164       struct re_backref_cache_entry* new_entry;
4165       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4166                               mctx->abkref_ents * 2);
4167       if (BE (new_entry == NULL, 0))
4168         {
4169           re_free (mctx->bkref_ents);
4170           return REG_ESPACE;
4171         }
4172       mctx->bkref_ents = new_entry;
4173       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4174               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4175       mctx->abkref_ents *= 2;
4176     }
4177   if (mctx->nbkref_ents > 0
4178       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4179     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4180
4181   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4182   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4183   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4184   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4185
4186   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4187      If bit N is clear, means that this entry won't epsilon-transition to
4188      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4189      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4190      such node.
4191
4192      A backreference does not epsilon-transition unless it is empty, so set
4193      to all zeros if FROM != TO.  */
4194   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4195     = (from == to ? -1 : 0);
4196
4197   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4198   if (mctx->max_mb_elem_len < to - from)
4199     mctx->max_mb_elem_len = to - from;
4200   return REG_NOERROR;
4201 }
4202
4203 /* Return the first entry with the same str_idx, or REG_MISSING if none is
4204    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4205
4206 static Idx
4207 internal_function
4208 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4209 {
4210   Idx left, right, mid, last;
4211   last = right = mctx->nbkref_ents;
4212   for (left = 0; left < right;)
4213     {
4214       mid = (left + right) / 2;
4215       if (mctx->bkref_ents[mid].str_idx < str_idx)
4216         left = mid + 1;
4217       else
4218         right = mid;
4219     }
4220   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4221     return left;
4222   else
4223     return REG_MISSING;
4224 }
4225
4226 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4227    at STR_IDX.  */
4228
4229 static reg_errcode_t
4230 internal_function
4231 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4232 {
4233 #ifdef DEBUG
4234   assert (mctx->sub_tops != NULL);
4235   assert (mctx->asub_tops > 0);
4236 #endif
4237   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4238     {
4239       Idx new_asub_tops = mctx->asub_tops * 2;
4240       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4241                                                    re_sub_match_top_t *,
4242                                                    new_asub_tops);
4243       if (BE (new_array == NULL, 0))
4244         return REG_ESPACE;
4245       mctx->sub_tops = new_array;
4246       mctx->asub_tops = new_asub_tops;
4247     }
4248   mctx->sub_tops[mctx->nsub_tops] = re_calloc (re_sub_match_top_t, 1);
4249   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4250     return REG_ESPACE;
4251   mctx->sub_tops[mctx->nsub_tops]->node = node;
4252   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4253   return REG_NOERROR;
4254 }
4255
4256 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4257    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4258
4259 static re_sub_match_last_t *
4260 internal_function
4261 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4262 {
4263   re_sub_match_last_t *new_entry;
4264   if (BE (subtop->nlasts == subtop->alasts, 0))
4265     {
4266       Idx new_alasts = 2 * subtop->alasts + 1;
4267       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4268                                                     re_sub_match_last_t *,
4269                                                     new_alasts);
4270       if (BE (new_array == NULL, 0))
4271         return NULL;
4272       subtop->lasts = new_array;
4273       subtop->alasts = new_alasts;
4274     }
4275   new_entry = re_calloc (re_sub_match_last_t, 1);
4276   if (BE (new_entry != NULL, 1))
4277     {
4278       subtop->lasts[subtop->nlasts] = new_entry;
4279       new_entry->node = node;
4280       new_entry->str_idx = str_idx;
4281       ++subtop->nlasts;
4282     }
4283   return new_entry;
4284 }
4285
4286 static void
4287 internal_function
4288 sift_ctx_init (re_sift_context_t *sctx,
4289                re_dfastate_t **sifted_sts,
4290                re_dfastate_t **limited_sts,
4291                Idx last_node, Idx last_str_idx)
4292 {
4293   sctx->sifted_states = sifted_sts;
4294   sctx->limited_states = limited_sts;
4295   sctx->last_node = last_node;
4296   sctx->last_str_idx = last_str_idx;
4297   re_node_set_init_empty (&sctx->limits);
4298 }