lib/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      Idx n) internal_function;
  22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  23 static void match_ctx_free (re_match_context_t *cache) internal_function;
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
  25                                           Idx str_idx, Idx from, Idx to)
  26      internal_function;
  27 static Idx search_cur_bkref_entry (re_match_context_t *mctx, Idx str_idx)
  28      internal_function;
  29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
  30                                            Idx str_idx) internal_function;
  31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  32                                                     Idx node, Idx str_idx)
  33      internal_function;
  34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  35                            re_dfastate_t **limited_sts, Idx last_node,
  36                            Idx last_str_idx)
  37      internal_function;
  38 static reg_errcode_t re_search_internal (const regex_t *preg,
  39                                          const char *string, Idx length,
  40                                          Idx start, regoff_t range, Idx stop,
  41                                          size_t nmatch, regmatch_t pmatch[],
  42                                          int eflags) internal_function;
  43 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
  44                                   const char *string1, Idx length1,
  45                                   const char *string2, Idx length2,
  46                                   Idx start, regoff_t range,
  47                                   struct re_registers *regs,
  48                                   Idx stop, int ret_len) internal_function;
  49 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
  50                                 const char *string, Idx length, Idx start,
  51                                 regoff_t range, Idx stop,
  52                                 struct re_registers *regs,
  53                                 int ret_len) internal_function;
  54 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  55                               Idx nregs, int regs_allocated) internal_function;
  56 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  57      internal_function;
  58 static Idx check_matching (re_match_context_t *mctx, int fl_longest_match,
  59                            Idx *p_match_first)
  60      internal_function;
  61 static Idx check_halt_state_context (const re_match_context_t *mctx,
  62                                      const re_dfastate_t *state, Idx idx)
  63      internal_function;
  64 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  65                          regmatch_t *prev_idx_match, Idx cur_node,
  66                          Idx cur_idx, Idx nmatch) internal_function;
  67 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  68                                       Idx str_idx, Idx dest_node, Idx nregs,
  69                                       regmatch_t *regs,
  70                                       re_node_set *eps_via_nodes) internal_function;
  71 static reg_errcode_t set_regs (const regex_t *preg,
  72                                const re_match_context_t *mctx,
  73                                size_t nmatch, regmatch_t *pmatch,
  74                                int fl_backtrack) internal_function;
  75 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  76
  77 #ifdef RE_ENABLE_I18N
  78 static int sift_states_iter_mb (const re_match_context_t *mctx,
  79                                 re_sift_context_t *sctx,
  80                                 Idx node_idx, Idx str_idx, Idx max_str_idx) internal_function;
  81 #endif /* RE_ENABLE_I18N */
  82 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  83                                            re_sift_context_t *sctx) internal_function;
  84 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  85                                           re_sift_context_t *sctx, Idx str_idx,
  86                                           re_node_set *cur_dest) internal_function;
  87 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  88                                               re_sift_context_t *sctx,
  89                                               Idx str_idx,
  90                                               re_node_set *dest_nodes) internal_function;
  91 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
  92                                             re_node_set *dest_nodes,
  93                                             const re_node_set *candidates) internal_function;
  94 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
  95                              Idx dst_node, Idx dst_idx, Idx src_node,
  96                              Idx src_idx) internal_function;
  97 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
  98                                         int boundaries, Idx subexp_idx,
  99                                         Idx from_node, Idx bkref_idx) internal_function;
 100 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
 101                                       Idx limit, Idx subexp_idx,
 102                                       Idx node, Idx str_idx,
 103                                       Idx bkref_idx) internal_function;
 104 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 105                                           re_node_set *dest_nodes,
 106                                           const re_node_set *candidates,
 107                                           re_node_set *limits,
 108                                           struct re_backref_cache_entry *bkref_ents,
 109                                           Idx str_idx) internal_function;
 110 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 111                                         re_sift_context_t *sctx,
 112                                         Idx str_idx, const re_node_set *candidates) internal_function;
 113 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 114                                         re_dfastate_t **src, Idx num) internal_function;
 115 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 116                                          re_match_context_t *mctx) internal_function;
 117 static re_dfastate_t *transit_state (reg_errcode_t *err,
 118                                      re_match_context_t *mctx,
 119                                      re_dfastate_t *state) internal_function;
 120 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 121                                             re_match_context_t *mctx,
 122                                             re_dfastate_t *next_state) internal_function;
 123 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 124                                                 re_node_set *cur_nodes,
 125                                                 Idx str_idx) internal_function;
 126 #if 0
 127 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 128                                         re_match_context_t *mctx,
 129                                         re_dfastate_t *pstate) internal_function;
 130 #endif
 131 #ifdef RE_ENABLE_I18N
 132 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 133                                        re_dfastate_t *pstate) internal_function;
 134 #endif /* RE_ENABLE_I18N */
 135 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 136                                           const re_node_set *nodes) internal_function;
 137 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 138                                  Idx bkref_node, Idx bkref_str_idx) internal_function;
 139 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 140                                      const re_sub_match_top_t *sub_top,
 141                                      re_sub_match_last_t *sub_last,
 142                                      Idx bkref_node, Idx bkref_str) internal_function;
 143 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 144                              Idx subexp_idx, int type) internal_function;
 145 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 146                                     state_array_t *path, Idx top_node,
 147                                     Idx top_str, Idx last_node, Idx last_str,
 148                                     int type) internal_function;
 149 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 150                                                    Idx str_idx,
 151                                                    re_node_set *cur_nodes,
 152                                                    re_node_set *next_nodes) internal_function;
 153 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 154                                                re_node_set *cur_nodes,
 155                                                Idx ex_subexp, int type) internal_function;
 156 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 157                                                    re_node_set *dst_nodes,
 158                                                    Idx target, Idx ex_subexp,
 159                                                    int type) internal_function;
 160 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 161                                          re_node_set *cur_nodes, Idx cur_str,
 162                                          Idx subexp_num, int type) internal_function;
 163 static int build_trtable (re_dfa_t *dfa,
 164                           re_dfastate_t *state) internal_function;
 165 #ifdef RE_ENABLE_I18N
 166 static int check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
 167                                     const re_string_t *input, Idx idx) internal_function;
 168 # ifdef _LIBC
 169 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 170                                                    size_t name_len) internal_function;
 171 # endif /* _LIBC */
 172 #endif /* RE_ENABLE_I18N */
 173 static Idx group_nodes_into_DFAstates (re_dfa_t *dfa,
 174                                        const re_dfastate_t *state,
 175                                        re_node_set *states_node,
 176                                        bitset *states_ch) internal_function;
 177 static int check_node_accept (const re_match_context_t *mctx,
 178                               const re_token_t *node, Idx idx) internal_function;
 179 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 180 \f
 181 /* Entry point for POSIX code.  */
 182
 183 /* regexec searches for a given pattern, specified by PREG, in the
 184    string STRING.
 185
 186    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 187    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 188    least NMATCH elements, and we set them to the offsets of the
 189    corresponding matched substrings.
 190
 191    EFLAGS specifies `execution flags' which affect matching: if
 192    REG_NOTBOL is set, then ^ does not match at the beginning of the
 193    string; if REG_NOTEOL is set, then $ does not match at the end.
 194
 195    We return 0 if we find a match and REG_NOMATCH if not.  */
 196
 197 int
 198 regexec (const regex_t *__restrict preg, const char *__restrict string,
 199          size_t nmatch, regmatch_t pmatch[], int eflags)
 200 {
 201   reg_errcode_t err;
 202   Idx start, length;
 203 #ifdef _LIBC
 204   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 205 #endif
 206
 207   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 208     return REG_BADPAT;
 209
 210   if (eflags & REG_STARTEND)
 211     {
 212       start = pmatch[0].rm_so;
 213       length = pmatch[0].rm_eo;
 214     }
 215   else
 216     {
 217       start = 0;
 218       length = strlen (string);
 219     }
 220
 221   __libc_lock_lock (dfa->lock);
 222   if (preg->re_no_sub)
 223     err = re_search_internal (preg, string, length, start, length - start,
 224                               length, 0, NULL, eflags);
 225   else
 226     err = re_search_internal (preg, string, length, start, length - start,
 227                               length, nmatch, pmatch, eflags);
 228   __libc_lock_unlock (dfa->lock);
 229   return err != REG_NOERROR;
 230 }
 231
 232 #ifdef _LIBC
 233 # include <shlib-compat.h>
 234 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 235
 236 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 237 __typeof__ (__regexec) __compat_regexec;
 238
 239 int
 240 attribute_compat_text_section
 241 __compat_regexec (const regex_t *__restrict preg,
 242                   const char *__restrict string, size_t nmatch,
 243                   regmatch_t pmatch[], int eflags)
 244 {
 245   return regexec (preg, string, nmatch, pmatch,
 246                   eflags & (REG_NOTBOL | REG_NOTEOL));
 247 }
 248 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 249 # endif
 250 #endif
 251
 252 /* Entry points for GNU code.  */
 253
 254 /* re_match, re_search, re_match_2, re_search_2
 255
 256    The former two functions operate on STRING with length LENGTH,
 257    while the later two operate on concatenation of STRING1 and STRING2
 258    with lengths LENGTH1 and LENGTH2, respectively.
 259
 260    re_match() matches the compiled pattern in BUFP against the string,
 261    starting at index START.
 262
 263    re_search() first tries matching at index START, then it tries to match
 264    starting from index START + 1, and so on.  The last start position tried
 265    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 266    way as re_match().)
 267
 268    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 269    the first STOP characters of the concatenation of the strings should be
 270    concerned.
 271
 272    If REGS is not NULL, and BUFP->re_no_sub is not set, the offsets of the match
 273    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 274    computed relative to the concatenation, not relative to the individual
 275    strings.)
 276
 277    On success, re_match* functions return the length of the match, re_search*
 278    return the position of the start of the match.  Return value -1 means no
 279    match was found and -2 indicates an internal error.  */
 280
 281 regoff_t
 282 re_match (struct re_pattern_buffer *bufp, const char *string,
 283           Idx length, Idx start, struct re_registers *regs)
 284 {
 285   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 286 }
 287 #ifdef _LIBC
 288 weak_alias (__re_match, re_match)
 289 #endif
 290
 291 regoff_t
 292 re_search (struct re_pattern_buffer *bufp, const char *string,
 293            Idx length, Idx start, regoff_t range, struct re_registers *regs)
 294 {
 295   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 296 }
 297 #ifdef _LIBC
 298 weak_alias (__re_search, re_search)
 299 #endif
 300
 301 regoff_t
 302 re_match_2 (struct re_pattern_buffer *bufp,
 303             const char *string1, Idx length1,
 304             const char *string2, Idx length2,
 305             Idx start, struct re_registers *regs, Idx stop)
 306 {
 307   return re_search_2_stub (bufp, string1, length1, string2, length2,
 308                            start, 0, regs, stop, 1);
 309 }
 310 #ifdef _LIBC
 311 weak_alias (__re_match_2, re_match_2)
 312 #endif
 313
 314 regoff_t
 315 re_search_2 (struct re_pattern_buffer *bufp,
 316              const char *string1, Idx length1,
 317              const char *string2, Idx length2,
 318              Idx start, regoff_t range, struct re_registers *regs, Idx stop)
 319 {
 320   return re_search_2_stub (bufp, string1, length1, string2, length2,
 321                            start, range, regs, stop, 0);
 322 }
 323 #ifdef _LIBC
 324 weak_alias (__re_search_2, re_search_2)
 325 #endif
 326
 327 static regoff_t
 328 internal_function
 329 re_search_2_stub (struct re_pattern_buffer *bufp,
 330                   const char *string1, Idx length1,
 331                   const char *string2, Idx length2,
 332                   Idx start, regoff_t range, struct re_registers *regs,
 333                   Idx stop, int ret_len)
 334 {
 335   const char *str;
 336   regoff_t rval;
 337   Idx len = length1 + length2;
 338   int free_str = 0;
 339
 340   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 341     return -2;
 342
 343   /* Concatenate the strings.  */
 344   if (length2 > 0)
 345     if (length1 > 0)
 346       {
 347         char *s = re_malloc (char, len);
 348
 349         if (BE (s == NULL, 0))
 350           return -2;
 351         memcpy (s, string1, length1);
 352         memcpy (s + length1, string2, length2);
 353         str = s;
 354         free_str = 1;
 355       }
 356     else
 357       str = string2;
 358   else
 359     str = string1;
 360
 361   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 362                          ret_len);
 363   if (free_str)
 364     re_free ((char *) str);
 365   return rval;
 366 }
 367
 368 /* The parameters have the same meaning as those of re_search.
 369    Additional parameters:
 370    If RET_LEN is nonzero the length of the match is returned (re_match style);
 371    otherwise the position of the match is returned.  */
 372
 373 static regoff_t
 374 internal_function
 375 re_search_stub (struct re_pattern_buffer *bufp,
 376                 const char *string, Idx length,
 377                 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
 378                 int ret_len)
 379 {
 380   reg_errcode_t result;
 381   regmatch_t *pmatch;
 382   Idx nregs;
 383   regoff_t rval;
 384   int eflags = 0;
 385 #ifdef _LIBC
 386   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 387 #endif
 388
 389   /* Check for out-of-range.  */
 390   if (BE (start < 0 || start > length, 0))
 391     return -1;
 392   if (sizeof start < sizeof range)
 393     {
 394       regoff_t length_offset = length;
 395       regoff_t start_offset = start;
 396       if (BE (length_offset - start_offset < range, 0))
 397         range = length_offset - start_offset;
 398       else if (BE (range < - start_offset, 0))
 399         range = -start_offset;
 400     }
 401   else
 402     {
 403       if (BE (start + range > length, 0))
 404         range = length - start;
 405       else if (BE (start + range < 0, 0))
 406         range = -start;
 407     }
 408
 409   __libc_lock_lock (dfa->lock);
 410
 411   eflags |= (bufp->re_not_bol) ? REG_NOTBOL : 0;
 412   eflags |= (bufp->re_not_eol) ? REG_NOTEOL : 0;
 413
 414   /* Compile fastmap if we haven't yet.  */
 415   if (range > 0 && bufp->re_fastmap != NULL && !bufp->re_fastmap_accurate)
 416     re_compile_fastmap (bufp);
 417
 418   if (BE (bufp->re_no_sub, 0))
 419     regs = NULL;
 420
 421   /* We need at least 1 register.  */
 422   if (regs == NULL)
 423     nregs = 1;
 424   else if (BE (bufp->re_regs_allocated == REG_FIXED
 425                && regs->rm_num_regs < bufp->re_nsub + 1, 0))
 426     {
 427       nregs = regs->rm_num_regs;
 428       if (BE (nregs < 1, 0))
 429         {
 430           /* Nothing can be copied to regs.  */
 431           regs = NULL;
 432           nregs = 1;
 433         }
 434     }
 435   else
 436     nregs = bufp->re_nsub + 1;
 437   pmatch = re_malloc (regmatch_t, nregs);
 438   if (BE (pmatch == NULL, 0))
 439     {
 440       rval = -2;
 441       goto out;
 442     }
 443
 444   result = re_search_internal (bufp, string, length, start, range, stop,
 445                                nregs, pmatch, eflags);
 446
 447   rval = 0;
 448
 449   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 450   if (result != REG_NOERROR)
 451     rval = -1;
 452   else if (regs != NULL)
 453     {
 454       /* If caller wants register contents data back, copy them.  */
 455       bufp->re_regs_allocated = re_copy_regs (regs, pmatch, nregs,
 456                                               bufp->re_regs_allocated);
 457       if (BE (bufp->re_regs_allocated == REG_UNALLOCATED, 0))
 458         rval = -2;
 459     }
 460
 461   if (BE (rval == 0, 1))
 462     {
 463       if (ret_len)
 464         {
 465           assert (pmatch[0].rm_so == start);
 466           rval = pmatch[0].rm_eo - start;
 467         }
 468       else
 469         rval = pmatch[0].rm_so;
 470     }
 471   re_free (pmatch);
 472  out:
 473   __libc_lock_unlock (dfa->lock);
 474   return rval;
 475 }
 476
 477 static unsigned
 478 internal_function
 479 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
 480               int regs_allocated)
 481 {
 482   int rval = REG_REALLOCATE;
 483   Idx i;
 484   Idx need_regs = nregs + 1;
 485   /* We need one extra element beyond `rm_num_regs' for the `-1' marker GNU code
 486      uses.  */
 487
 488   /* Have the register data arrays been allocated?  */
 489   if (regs_allocated == REG_UNALLOCATED)
 490     { /* No.  So allocate them with malloc.  */
 491       regs->rm_start = re_malloc (regoff_t, need_regs);
 492       regs->rm_end = re_malloc (regoff_t, need_regs);
 493       if (BE (regs->rm_start == NULL, 0) || BE (regs->rm_end == NULL, 0))
 494         return REG_UNALLOCATED;
 495       regs->rm_num_regs = need_regs;
 496     }
 497   else if (regs_allocated == REG_REALLOCATE)
 498     { /* Yes.  If we need more elements than were already
 499          allocated, reallocate them.  If we need fewer, just
 500          leave it alone.  */
 501       if (BE (need_regs > regs->rm_num_regs, 0))
 502         {
 503           regoff_t *new_start =
 504             re_realloc (regs->rm_start, regoff_t, need_regs);
 505           regoff_t *new_end = re_realloc (regs->rm_end, regoff_t, need_regs);
 506           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 507             return REG_UNALLOCATED;
 508           regs->rm_start = new_start;
 509           regs->rm_end = new_end;
 510           regs->rm_num_regs = need_regs;
 511         }
 512     }
 513   else
 514     {
 515       assert (regs_allocated == REG_FIXED);
 516       /* This function may not be called with REG_FIXED and nregs too big.  */
 517       assert (regs->rm_num_regs >= nregs);
 518       rval = REG_FIXED;
 519     }
 520
 521   /* Copy the regs.  */
 522   for (i = 0; i < nregs; ++i)
 523     {
 524       regs->rm_start[i] = pmatch[i].rm_so;
 525       regs->rm_end[i] = pmatch[i].rm_eo;
 526     }
 527   for ( ; i < regs->rm_num_regs; ++i)
 528     regs->rm_start[i] = regs->rm_end[i] = -1;
 529
 530   return rval;
 531 }
 532
 533 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 534    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 535    this memory for recording register information.  STARTS and ENDS
 536    must be allocated using the malloc library routine, and must each
 537    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 538
 539    If NUM_REGS == 0, then subsequent matches should allocate their own
 540    register data.
 541
 542    Unless this function is called, the first search or match using
 543    PATTERN_BUFFER will allocate its own register data, without
 544    freeing the old data.  */
 545
 546 void
 547 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
 548                   __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
 549 {
 550   if (num_regs)
 551     {
 552       bufp->re_regs_allocated = REG_REALLOCATE;
 553       regs->rm_num_regs = num_regs;
 554       regs->rm_start = starts;
 555       regs->rm_end = ends;
 556     }
 557   else
 558     {
 559       bufp->re_regs_allocated = REG_UNALLOCATED;
 560       regs->rm_num_regs = 0;
 561       regs->rm_start = regs->rm_end = NULL;
 562     }
 563 }
 564 #ifdef _LIBC
 565 weak_alias (__re_set_registers, re_set_registers)
 566 #endif
 567 \f
 568 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 569    them unless specifically requested.  */
 570
 571 #if defined _REGEX_RE_COMP || defined _LIBC
 572 int
 573 # ifdef _LIBC
 574 weak_function
 575 # endif
 576 re_exec (const char *s)
 577 {
 578   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 579 }
 580 #endif /* _REGEX_RE_COMP */
 581 \f
 582 /* Internal entry point.  */
 583
 584 /* Searches for a compiled pattern PREG in the string STRING, whose
 585    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 586    mingings with regexec.  START, and RANGE have the same meanings
 587    with re_search.
 588    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 589    otherwise return the error code.
 590    Note: We assume front end functions already check ranges.
 591    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 592
 593 static reg_errcode_t
 594 internal_function
 595 re_search_internal (const regex_t *preg,
 596                     const char *string, Idx length,
 597                     Idx start, regoff_t range, Idx stop,
 598                     size_t nmatch, regmatch_t pmatch[],
 599                     int eflags)
 600 {
 601   reg_errcode_t err;
 602   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 603   Idx left_lim, right_lim;
 604   int incr;
 605   int fl_longest_match, match_kind;
 606   Idx match_first, match_last = REG_MISSING;
 607   Idx extra_nmatch;
 608   int sb, ch;
 609 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 610   re_match_context_t mctx = { .dfa = dfa };
 611 #else
 612   re_match_context_t mctx;
 613 #endif
 614   char *fastmap = (preg->re_fastmap != NULL && preg->re_fastmap_accurate
 615                    && range && !preg->re_can_be_null) ? preg->re_fastmap : NULL;
 616   unsigned REG_TRANSLATE_TYPE t =
 617     (unsigned REG_TRANSLATE_TYPE) preg->re_translate;
 618
 619 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 620   memset (&mctx, '\0', sizeof (re_match_context_t));
 621   mctx.dfa = dfa;
 622 #endif
 623
 624   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 625   nmatch -= extra_nmatch;
 626
 627   /* Check if the DFA haven't been compiled.  */
 628   if (BE (preg->re_used == 0 || dfa->init_state == NULL
 629           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 630           || dfa->init_state_begbuf == NULL, 0))
 631     return REG_NOMATCH;
 632
 633 #ifdef DEBUG
 634   /* We assume front-end functions already check them.  */
 635   assert (start + range >= 0 && start + range <= length);
 636 #endif
 637
 638   /* If initial states with non-begbuf contexts have no elements,
 639      the regex must be anchored.  If preg->re_newline_anchor is set,
 640      we'll never use init_state_nl, so do not check it.  */
 641   if (dfa->init_state->nodes.nelem == 0
 642       && dfa->init_state_word->nodes.nelem == 0
 643       && (dfa->init_state_nl->nodes.nelem == 0
 644           || !preg->re_newline_anchor))
 645     {
 646       if (start != 0 && start + range != 0)
 647         return REG_NOMATCH;
 648       start = range = 0;
 649     }
 650
 651   /* We must check the longest matching, if nmatch > 0.  */
 652   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 653
 654   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 655                             preg->re_translate,
 656                             preg->re_syntax & REG_IGNORE_CASE, dfa);
 657   if (BE (err != REG_NOERROR, 0))
 658     goto free_return;
 659   mctx.input.stop = stop;
 660   mctx.input.raw_stop = stop;
 661   mctx.input.newline_anchor = preg->re_newline_anchor;
 662
 663   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 664   if (BE (err != REG_NOERROR, 0))
 665     goto free_return;
 666
 667   /* We will log all the DFA states through which the dfa pass,
 668      if nmatch > 1, or this dfa has "multibyte node", which is a
 669      back-reference or a node which can accept multibyte character or
 670      multi character collating element.  */
 671   if (nmatch > 1 || dfa->has_mb_node)
 672     {
 673       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 674       if (BE (mctx.state_log == NULL, 0))
 675         {
 676           err = REG_ESPACE;
 677           goto free_return;
 678         }
 679     }
 680   else
 681     mctx.state_log = NULL;
 682
 683   match_first = start;
 684   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 685                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 686
 687   /* Check incrementally whether of not the input string match.  */
 688   incr = (range < 0) ? -1 : 1;
 689   left_lim = (range < 0) ? start + range : start;
 690   right_lim = (range < 0) ? start : start + range;
 691   sb = dfa->mb_cur_max == 1;
 692   match_kind =
 693     (fastmap
 694      ? ((sb || !(preg->re_syntax & REG_IGNORE_CASE || t) ? 4 : 0)
 695         | (range >= 0 ? 2 : 0)
 696         | (t != NULL ? 1 : 0))
 697      : 8);
 698
 699   for (;; match_first += incr)
 700     {
 701       err = REG_NOMATCH;
 702       if (match_first < left_lim || right_lim < match_first)
 703         goto free_return;
 704
 705       /* Advance as rapidly as possible through the string, until we
 706          find a plausible place to start matching.  This may be done
 707          with varying efficiency, so there are various possibilities:
 708          only the most common of them are specialized, in order to
 709          save on code size.  We use a switch statement for speed.  */
 710       switch (match_kind)
 711         {
 712         case 8:
 713           /* No fastmap.  */
 714           break;
 715
 716         case 7:
 717           /* Fastmap with single-byte translation, match forward.  */
 718           while (BE (match_first < right_lim, 1)
 719                  && !fastmap[t[(unsigned char) string[match_first]]])
 720             ++match_first;
 721           goto forward_match_found_start_or_reached_end;
 722
 723         case 6:
 724           /* Fastmap without translation, match forward.  */
 725           while (BE (match_first < right_lim, 1)
 726                  && !fastmap[(unsigned char) string[match_first]])
 727             ++match_first;
 728
 729         forward_match_found_start_or_reached_end:
 730           if (BE (match_first == right_lim, 0))
 731             {
 732               ch = match_first >= length
 733                        ? 0 : (unsigned char) string[match_first];
 734               if (!fastmap[t ? t[ch] : ch])
 735                 goto free_return;
 736             }
 737           break;
 738
 739         case 4:
 740         case 5:
 741           /* Fastmap without multi-byte translation, match backwards.  */
 742           while (match_first >= left_lim)
 743             {
 744               ch = match_first >= length
 745                        ? 0 : (unsigned char) string[match_first];
 746               if (fastmap[t ? t[ch] : ch])
 747                 break;
 748               --match_first;
 749             }
 750           if (match_first < left_lim)
 751             goto free_return;
 752           break;
 753
 754         default:
 755           /* In this case, we can't determine easily the current byte,
 756              since it might be a component byte of a multibyte
 757              character.  Then we use the constructed buffer instead.  */
 758           for (;;)
 759             {
 760               /* If MATCH_FIRST is out of the valid range, reconstruct the
 761                  buffers.  */
 762               __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
 763               if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
 764                 {
 765                   err = re_string_reconstruct (&mctx.input, match_first,
 766                                                eflags);
 767                   if (BE (err != REG_NOERROR, 0))
 768                     goto free_return;
 769
 770                   offset = match_first - mctx.input.raw_mbs_idx;
 771                 }
 772               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 773                  Note that MATCH_FIRST must not be smaller than 0.  */
 774               ch = (match_first >= length
 775                     ? 0 : re_string_byte_at (&mctx.input, offset));
 776               if (fastmap[ch])
 777                 break;
 778               match_first += incr;
 779               if (match_first < left_lim || match_first > right_lim)
 780                 {
 781                   err = REG_NOMATCH;
 782                   goto free_return;
 783                 }
 784             }
 785           break;
 786         }
 787
 788       /* Reconstruct the buffers so that the matcher can assume that
 789          the matching starts from the beginning of the buffer.  */
 790       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 791       if (BE (err != REG_NOERROR, 0))
 792         goto free_return;
 793
 794 #ifdef RE_ENABLE_I18N
 795      /* Don't consider this char as a possible match start if it part,
 796         yet isn't the head, of a multibyte character.  */
 797       if (!sb && !re_string_first_byte (&mctx.input, 0))
 798         continue;
 799 #endif
 800
 801       /* It seems to be appropriate one, then use the matcher.  */
 802       /* We assume that the matching starts from 0.  */
 803       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 804       match_last = check_matching (&mctx, fl_longest_match,
 805                                    range >= 0 ? &match_first : NULL);
 806       if (match_last != REG_MISSING)
 807         {
 808           if (BE (match_last == REG_ERROR, 0))
 809             {
 810               err = REG_ESPACE;
 811               goto free_return;
 812             }
 813           else
 814             {
 815               mctx.match_last = match_last;
 816               if ((!preg->re_no_sub && nmatch > 1) || dfa->nbackref)
 817                 {
 818                   re_dfastate_t *pstate = mctx.state_log[match_last];
 819                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 820                                                              match_last);
 821                 }
 822               if ((!preg->re_no_sub && nmatch > 1 && dfa->has_plural_match)
 823                   || dfa->nbackref)
 824                 {
 825                   err = prune_impossible_nodes (&mctx);
 826                   if (err == REG_NOERROR)
 827                     break;
 828                   if (BE (err != REG_NOMATCH, 0))
 829                     goto free_return;
 830                   match_last = REG_MISSING;
 831                 }
 832               else
 833                 break; /* We found a match.  */
 834             }
 835         }
 836
 837       match_ctx_clean (&mctx);
 838     }
 839
 840 #ifdef DEBUG
 841   assert (match_last != REG_MISSING);
 842   assert (err == REG_NOERROR);
 843 #endif
 844
 845   /* Set pmatch[] if we need.  */
 846   if (nmatch > 0)
 847     {
 848       Idx reg_idx;
 849
 850       /* Initialize registers.  */
 851       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 852         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 853
 854       /* Set the points where matching start/end.  */
 855       pmatch[0].rm_so = 0;
 856       pmatch[0].rm_eo = mctx.match_last;
 857       /* FIXME: This function should fail if mctx.match_last exceeds
 858          the maximum possible regoff_t value.  We need a new error
 859          code REG_OVERFLOW.  */
 860
 861       if (!preg->re_no_sub && nmatch > 1)
 862         {
 863           err = set_regs (preg, &mctx, nmatch, pmatch,
 864                           dfa->has_plural_match && dfa->nbackref > 0);
 865           if (BE (err != REG_NOERROR, 0))
 866             goto free_return;
 867         }
 868
 869       /* At last, add the offset to the each registers, since we slided
 870          the buffers so that we could assume that the matching starts
 871          from 0.  */
 872       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 873         if (pmatch[reg_idx].rm_so != -1)
 874           {
 875 #ifdef RE_ENABLE_I18N
 876             if (BE (mctx.input.offsets_needed != 0, 0))
 877               {
 878                 pmatch[reg_idx].rm_so =
 879                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 880                    ? mctx.input.valid_raw_len
 881                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 882                 pmatch[reg_idx].rm_eo =
 883                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 884                    ? mctx.input.valid_raw_len
 885                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 886               }
 887 #else
 888             assert (mctx.input.offsets_needed == 0);
 889 #endif
 890             pmatch[reg_idx].rm_so += match_first;
 891             pmatch[reg_idx].rm_eo += match_first;
 892           }
 893       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 894         {
 895           pmatch[nmatch + reg_idx].rm_so = -1;
 896           pmatch[nmatch + reg_idx].rm_eo = -1;
 897         }
 898
 899       if (dfa->subexp_map)
 900         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 901           if (dfa->subexp_map[reg_idx] != reg_idx)
 902             {
 903               pmatch[reg_idx + 1].rm_so
 904                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 905               pmatch[reg_idx + 1].rm_eo
 906                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 907             }
 908     }
 909
 910  free_return:
 911   re_free (mctx.state_log);
 912   if (dfa->nbackref)
 913     match_ctx_free (&mctx);
 914   re_string_destruct (&mctx.input);
 915   return err;
 916 }
 917
 918 static reg_errcode_t
 919 internal_function
 920 prune_impossible_nodes (re_match_context_t *mctx)
 921 {
 922   re_dfa_t *const dfa = mctx->dfa;
 923   Idx halt_node, match_last;
 924   reg_errcode_t ret;
 925   re_dfastate_t **sifted_states;
 926   re_dfastate_t **lim_states = NULL;
 927   re_sift_context_t sctx;
 928 #ifdef DEBUG
 929   assert (mctx->state_log != NULL);
 930 #endif
 931   match_last = mctx->match_last;
 932   halt_node = mctx->last_node;
 933   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 934   if (BE (sifted_states == NULL, 0))
 935     {
 936       ret = REG_ESPACE;
 937       goto free_return;
 938     }
 939   if (dfa->nbackref)
 940     {
 941       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 942       if (BE (lim_states == NULL, 0))
 943         {
 944           ret = REG_ESPACE;
 945           goto free_return;
 946         }
 947       while (1)
 948         {
 949           memset (lim_states, '\0',
 950                   sizeof (re_dfastate_t *) * (match_last + 1));
 951           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 952                          match_last);
 953           ret = sift_states_backward (mctx, &sctx);
 954           re_node_set_free (&sctx.limits);
 955           if (BE (ret != REG_NOERROR, 0))
 956               goto free_return;
 957           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 958             break;
 959           do
 960             {
 961               --match_last;
 962               if (! REG_VALID_INDEX (match_last))
 963                 {
 964                   ret = REG_NOMATCH;
 965                   goto free_return;
 966                 }
 967             } while (mctx->state_log[match_last] == NULL
 968                      || !mctx->state_log[match_last]->halt);
 969           halt_node = check_halt_state_context (mctx,
 970                                                 mctx->state_log[match_last],
 971                                                 match_last);
 972         }
 973       ret = merge_state_array (dfa, sifted_states, lim_states,
 974                                match_last + 1);
 975       re_free (lim_states);
 976       lim_states = NULL;
 977       if (BE (ret != REG_NOERROR, 0))
 978         goto free_return;
 979     }
 980   else
 981     {
 982       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 983       ret = sift_states_backward (mctx, &sctx);
 984       re_node_set_free (&sctx.limits);
 985       if (BE (ret != REG_NOERROR, 0))
 986         goto free_return;
 987     }
 988   re_free (mctx->state_log);
 989   mctx->state_log = sifted_states;
 990   sifted_states = NULL;
 991   mctx->last_node = halt_node;
 992   mctx->match_last = match_last;
 993   ret = REG_NOERROR;
 994  free_return:
 995   re_free (sifted_states);
 996   re_free (lim_states);
 997   return ret;
 998 }
 999
1000 /* Acquire an initial state and return it.
1001    We must select appropriate initial state depending on the context,
1002    since initial states may have constraints like "\<", "^", etc..  */
1003
1004 static inline re_dfastate_t *
1005 __attribute ((always_inline)) internal_function
1006 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1007                             Idx idx)
1008 {
1009   re_dfa_t *const dfa = mctx->dfa;
1010   if (dfa->init_state->has_constraint)
1011     {
1012       unsigned int context;
1013       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1014       if (IS_WORD_CONTEXT (context))
1015         return dfa->init_state_word;
1016       else if (IS_ORDINARY_CONTEXT (context))
1017         return dfa->init_state;
1018       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1019         return dfa->init_state_begbuf;
1020       else if (IS_NEWLINE_CONTEXT (context))
1021         return dfa->init_state_nl;
1022       else if (IS_BEGBUF_CONTEXT (context))
1023         {
1024           /* It is relatively rare case, then calculate on demand.  */
1025           return re_acquire_state_context (err, dfa,
1026                                            dfa->init_state->entrance_nodes,
1027                                            context);
1028         }
1029       else
1030         /* Must not happen?  */
1031         return dfa->init_state;
1032     }
1033   else
1034     return dfa->init_state;
1035 }
1036
1037 /* Check whether the regular expression match input string INPUT or not,
1038    and return the index where the matching end.  Return REG_MISSING if
1039    there is no match, and return REG_ERROR in case of an error.
1040    FL_LONGEST_MATCH means we want the POSIX longest matching.
1041    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1042    next place where we may want to try matching.
1043    Note that the matcher assume that the maching starts from the current
1044    index of the buffer.  */
1045
1046 static Idx
1047 internal_function
1048 check_matching (re_match_context_t *mctx, int fl_longest_match,
1049                 Idx *p_match_first)
1050 {
1051   re_dfa_t *const dfa = mctx->dfa;
1052   reg_errcode_t err;
1053   Idx match = 0;
1054   Idx match_last = REG_MISSING;
1055   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1056   re_dfastate_t *cur_state;
1057   int at_init_state = p_match_first != NULL;
1058   Idx next_start_idx = cur_str_idx;
1059
1060   err = REG_NOERROR;
1061   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1062   /* An initial state must not be NULL (invalid).  */
1063   if (BE (cur_state == NULL, 0))
1064     {
1065       assert (err == REG_ESPACE);
1066       return REG_ERROR;
1067     }
1068
1069   if (mctx->state_log != NULL)
1070     {
1071       mctx->state_log[cur_str_idx] = cur_state;
1072
1073       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1074          later.  E.g. Processing back references.  */
1075       if (BE (dfa->nbackref, 0))
1076         {
1077           at_init_state = 0;
1078           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1079           if (BE (err != REG_NOERROR, 0))
1080             return err;
1081
1082           if (cur_state->has_backref)
1083             {
1084               err = transit_state_bkref (mctx, &cur_state->nodes);
1085               if (BE (err != REG_NOERROR, 0))
1086                 return err;
1087             }
1088         }
1089     }
1090
1091   /* If the RE accepts NULL string.  */
1092   if (BE (cur_state->halt, 0))
1093     {
1094       if (!cur_state->has_constraint
1095           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1096         {
1097           if (!fl_longest_match)
1098             return cur_str_idx;
1099           else
1100             {
1101               match_last = cur_str_idx;
1102               match = 1;
1103             }
1104         }
1105     }
1106
1107   while (!re_string_eoi (&mctx->input))
1108     {
1109       re_dfastate_t *old_state = cur_state;
1110       Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1111
1112       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1113           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1114               && mctx->input.valid_len < mctx->input.len))
1115         {
1116           err = extend_buffers (mctx);
1117           if (BE (err != REG_NOERROR, 0))
1118             {
1119               assert (err == REG_ESPACE);
1120               return REG_ERROR;
1121             }
1122         }
1123
1124       cur_state = transit_state (&err, mctx, cur_state);
1125       if (mctx->state_log != NULL)
1126         cur_state = merge_state_with_log (&err, mctx, cur_state);
1127
1128       if (cur_state == NULL)
1129         {
1130           /* Reached the invalid state or an error.  Try to recover a valid
1131              state using the state log, if available and if we have not
1132              already found a valid (even if not the longest) match.  */
1133           if (BE (err != REG_NOERROR, 0))
1134             return REG_ERROR;
1135
1136           if (mctx->state_log == NULL
1137               || (match && !fl_longest_match)
1138               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1139             break;
1140         }
1141
1142       if (BE (at_init_state, 0))
1143         {
1144           if (old_state == cur_state)
1145             next_start_idx = next_char_idx;
1146           else
1147             at_init_state = 0;
1148         }
1149
1150       if (cur_state->halt)
1151         {
1152           /* Reached a halt state.
1153              Check the halt state can satisfy the current context.  */
1154           if (!cur_state->has_constraint
1155               || check_halt_state_context (mctx, cur_state,
1156                                            re_string_cur_idx (&mctx->input)))
1157             {
1158               /* We found an appropriate halt state.  */
1159               match_last = re_string_cur_idx (&mctx->input);
1160               match = 1;
1161
1162               /* We found a match, do not modify match_first below.  */
1163               p_match_first = NULL;
1164               if (!fl_longest_match)
1165                 break;
1166             }
1167         }
1168     }
1169
1170   if (p_match_first)
1171     *p_match_first += next_start_idx;
1172
1173   return match_last;
1174 }
1175
1176 /* Check NODE match the current context.  */
1177
1178 static int
1179 internal_function
1180 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1181 {
1182   re_token_type_t type = dfa->nodes[node].type;
1183   unsigned int constraint = dfa->nodes[node].constraint;
1184   if (type != END_OF_RE)
1185     return 0;
1186   if (!constraint)
1187     return 1;
1188   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1189     return 0;
1190   return 1;
1191 }
1192
1193 /* Check the halt state STATE match the current context.
1194    Return 0 if not match, if the node, STATE has, is a halt node and
1195    match the context, return the node.  */
1196
1197 static Idx
1198 internal_function
1199 check_halt_state_context (const re_match_context_t *mctx,
1200                           const re_dfastate_t *state, Idx idx)
1201 {
1202   Idx i;
1203   unsigned int context;
1204 #ifdef DEBUG
1205   assert (state->halt);
1206 #endif
1207   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1208   for (i = 0; i < state->nodes.nelem; ++i)
1209     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1210       return state->nodes.elems[i];
1211   return 0;
1212 }
1213
1214 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1215    corresponding to the DFA).
1216    Return the destination node, and update EPS_VIA_NODES;
1217    return REG_MISSING in case of errors.  */
1218
1219 static Idx
1220 internal_function
1221 proceed_next_node (const re_match_context_t *mctx,
1222                    Idx nregs, regmatch_t *regs, Idx *pidx, Idx node,
1223                    re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1224 {
1225   re_dfa_t *const dfa = mctx->dfa;
1226   Idx i;
1227   int err;
1228   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1229     {
1230       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1231       re_node_set *edests = &dfa->edests[node];
1232       Idx dest_node;
1233       err = re_node_set_insert (eps_via_nodes, node);
1234       if (BE (err < 0, 0))
1235         return REG_ERROR;
1236       /* Pick up a valid destination, or return REG_MISSING if none
1237          is found.  */
1238       for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1239         {
1240           Idx candidate = edests->elems[i];
1241           if (!re_node_set_contains (cur_nodes, candidate))
1242             continue;
1243           if (dest_node == REG_MISSING)
1244             dest_node = candidate;
1245
1246           else
1247             {
1248               /* In order to avoid infinite loop like "(a*)*", return the second
1249                  epsilon-transition if the first was already considered.  */
1250               if (re_node_set_contains (eps_via_nodes, dest_node))
1251                 return candidate;
1252
1253               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1254               else if (fs != NULL
1255                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1256                                            eps_via_nodes))
1257                 return REG_ERROR;
1258
1259               /* We know we are going to exit.  */
1260               break;
1261             }
1262         }
1263       return dest_node;
1264     }
1265   else
1266     {
1267       Idx naccepted = 0;
1268       re_token_type_t type = dfa->nodes[node].type;
1269
1270 #ifdef RE_ENABLE_I18N
1271       if (dfa->nodes[node].accept_mb)
1272         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1273       else
1274 #endif /* RE_ENABLE_I18N */
1275       if (type == OP_BACK_REF)
1276         {
1277           Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1278           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1279           if (fs != NULL)
1280             {
1281               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1282                 return REG_MISSING;
1283               else if (naccepted)
1284                 {
1285                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1286                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1287                               naccepted) != 0)
1288                     return REG_MISSING;
1289                 }
1290             }
1291
1292           if (naccepted == 0)
1293             {
1294               Idx dest_node;
1295               err = re_node_set_insert (eps_via_nodes, node);
1296               if (BE (err < 0, 0))
1297                 return REG_ERROR;
1298               dest_node = dfa->edests[node].elems[0];
1299               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1300                                         dest_node))
1301                 return dest_node;
1302             }
1303         }
1304
1305       if (naccepted != 0
1306           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1307         {
1308           Idx dest_node = dfa->nexts[node];
1309           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1310           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1311                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1312                                                dest_node)))
1313             return REG_MISSING;
1314           re_node_set_empty (eps_via_nodes);
1315           return dest_node;
1316         }
1317     }
1318   return REG_MISSING;
1319 }
1320
1321 static reg_errcode_t
1322 internal_function
1323 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1324                  Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1325 {
1326   reg_errcode_t err;
1327   Idx num = fs->num++;
1328   if (fs->num == fs->alloc)
1329     {
1330       struct re_fail_stack_ent_t *new_array =
1331         re_realloc (fs->stack, struct re_fail_stack_ent_t, fs->alloc * 2);
1332       if (new_array == NULL)
1333         return REG_ESPACE;
1334       fs->alloc *= 2;
1335       fs->stack = new_array;
1336     }
1337   fs->stack[num].idx = str_idx;
1338   fs->stack[num].node = dest_node;
1339   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1340   if (fs->stack[num].regs == NULL)
1341     return REG_ESPACE;
1342   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1343   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1344   return err;
1345 }
1346
1347 static Idx
1348 internal_function
1349 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx,
1350                 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1351 {
1352   Idx num = --fs->num;
1353   assert (REG_VALID_INDEX (num));
1354   *pidx = fs->stack[num].idx;
1355   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1356   re_node_set_free (eps_via_nodes);
1357   re_free (fs->stack[num].regs);
1358   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1359   return fs->stack[num].node;
1360 }
1361
1362 /* Set the positions where the subexpressions are starts/ends to registers
1363    PMATCH.
1364    Note: We assume that pmatch[0] is already set, and
1365    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1366
1367 static reg_errcode_t
1368 internal_function
1369 set_regs (const regex_t *preg, const re_match_context_t *mctx,
1370           size_t nmatch, regmatch_t *pmatch, int fl_backtrack)
1371 {
1372   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1373   Idx idx, cur_node;
1374   re_node_set eps_via_nodes;
1375   struct re_fail_stack_t *fs;
1376   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1377   regmatch_t *prev_idx_match;
1378   int prev_idx_match_malloced = 0;
1379
1380 #ifdef DEBUG
1381   assert (nmatch > 1);
1382   assert (mctx->state_log != NULL);
1383 #endif
1384   if (fl_backtrack)
1385     {
1386       fs = &fs_body;
1387       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1388       if (fs->stack == NULL)
1389         return REG_ESPACE;
1390     }
1391   else
1392     fs = NULL;
1393
1394   cur_node = dfa->init_node;
1395   re_node_set_init_empty (&eps_via_nodes);
1396
1397   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1398     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1399   else
1400     {
1401       prev_idx_match = re_malloc (regmatch_t, nmatch);
1402       if (prev_idx_match == NULL)
1403         {
1404           free_fail_stack_return (fs);
1405           return REG_ESPACE;
1406         }
1407       prev_idx_match_malloced = 1;
1408     }
1409   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1410
1411   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1412     {
1413       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1414
1415       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1416         {
1417           Idx reg_idx;
1418           if (fs)
1419             {
1420               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1421                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1422                   break;
1423               if (reg_idx == nmatch)
1424                 {
1425                   re_node_set_free (&eps_via_nodes);
1426                   if (prev_idx_match_malloced)
1427                     re_free (prev_idx_match);
1428                   return free_fail_stack_return (fs);
1429                 }
1430               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1431                                          &eps_via_nodes);
1432             }
1433           else
1434             {
1435               re_node_set_free (&eps_via_nodes);
1436               if (prev_idx_match_malloced)
1437                 re_free (prev_idx_match);
1438               return REG_NOERROR;
1439             }
1440         }
1441
1442       /* Proceed to next node.  */
1443       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1444                                     &eps_via_nodes, fs);
1445
1446       if (BE (! REG_VALID_INDEX (cur_node), 0))
1447         {
1448           if (BE (cur_node == REG_ERROR, 0))
1449             {
1450               re_node_set_free (&eps_via_nodes);
1451               if (prev_idx_match_malloced)
1452                 re_free (prev_idx_match);
1453               free_fail_stack_return (fs);
1454               return REG_ESPACE;
1455             }
1456           if (fs)
1457             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1458                                        &eps_via_nodes);
1459           else
1460             {
1461               re_node_set_free (&eps_via_nodes);
1462               if (prev_idx_match_malloced)
1463                 re_free (prev_idx_match);
1464               return REG_NOMATCH;
1465             }
1466         }
1467     }
1468   re_node_set_free (&eps_via_nodes);
1469   if (prev_idx_match_malloced)
1470     re_free (prev_idx_match);
1471   return free_fail_stack_return (fs);
1472 }
1473
1474 static reg_errcode_t
1475 internal_function
1476 free_fail_stack_return (struct re_fail_stack_t *fs)
1477 {
1478   if (fs)
1479     {
1480       Idx fs_idx;
1481       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1482         {
1483           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1484           re_free (fs->stack[fs_idx].regs);
1485         }
1486       re_free (fs->stack);
1487     }
1488   return REG_NOERROR;
1489 }
1490
1491 static void
1492 internal_function
1493 update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1494              Idx cur_node, Idx cur_idx, Idx nmatch)
1495 {
1496   int type = dfa->nodes[cur_node].type;
1497   if (type == OP_OPEN_SUBEXP)
1498     {
1499       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1500
1501       /* We are at the first node of this sub expression.  */
1502       if (reg_num < nmatch)
1503         {
1504           pmatch[reg_num].rm_so = cur_idx;
1505           pmatch[reg_num].rm_eo = -1;
1506         }
1507     }
1508   else if (type == OP_CLOSE_SUBEXP)
1509     {
1510       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1511       if (reg_num < nmatch)
1512         {
1513           /* We are at the last node of this sub expression.  */
1514           if (pmatch[reg_num].rm_so < cur_idx)
1515             {
1516               pmatch[reg_num].rm_eo = cur_idx;
1517               /* This is a non-empty match or we are not inside an optional
1518                  subexpression.  Accept this right away.  */
1519               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1520             }
1521           else
1522             {
1523               if (dfa->nodes[cur_node].opt_subexp
1524                   && prev_idx_match[reg_num].rm_so != -1)
1525                 /* We transited through an empty match for an optional
1526                    subexpression, like (a?)*, and this is not the subexp's
1527                    first match.  Copy back the old content of the registers
1528                    so that matches of an inner subexpression are undone as
1529                    well, like in ((a?))*.  */
1530                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1531               else
1532                 /* We completed a subexpression, but it may be part of
1533                    an optional one, so do not update PREV_IDX_MATCH.  */
1534                 pmatch[reg_num].rm_eo = cur_idx;
1535             }
1536         }
1537     }
1538 }
1539
1540 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1541    and sift the nodes in each states according to the following rules.
1542    Updated state_log will be wrote to STATE_LOG.
1543
1544    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1545      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1546         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1547         the LAST_NODE, we throw away the node `a'.
1548      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1549         string `s' and transit to `b':
1550         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1551            away the node `a'.
1552         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1553             thrown away, we throw away the node `a'.
1554      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1555         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1556            node `a'.
1557         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1558             we throw away the node `a'.  */
1559
1560 #define STATE_NODE_CONTAINS(state,node) \
1561   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1562
1563 static reg_errcode_t
1564 internal_function
1565 sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1566 {
1567   reg_errcode_t err;
1568   int null_cnt = 0;
1569   Idx str_idx = sctx->last_str_idx;
1570   re_node_set cur_dest;
1571
1572 #ifdef DEBUG
1573   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1574 #endif
1575
1576   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1577      transit to the last_node and the last_node itself.  */
1578   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1579   if (BE (err != REG_NOERROR, 0))
1580     return err;
1581   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1582   if (BE (err != REG_NOERROR, 0))
1583     goto free_return;
1584
1585   /* Then check each states in the state_log.  */
1586   while (str_idx > 0)
1587     {
1588       /* Update counters.  */
1589       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1590       if (null_cnt > mctx->max_mb_elem_len)
1591         {
1592           memset (sctx->sifted_states, '\0',
1593                   sizeof (re_dfastate_t *) * str_idx);
1594           re_node_set_free (&cur_dest);
1595           return REG_NOERROR;
1596         }
1597       re_node_set_empty (&cur_dest);
1598       --str_idx;
1599
1600       if (mctx->state_log[str_idx])
1601         {
1602           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1603           if (BE (err != REG_NOERROR, 0))
1604             goto free_return;
1605         }
1606
1607       /* Add all the nodes which satisfy the following conditions:
1608          - It can epsilon transit to a node in CUR_DEST.
1609          - It is in CUR_SRC.
1610          And update state_log.  */
1611       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1612       if (BE (err != REG_NOERROR, 0))
1613         goto free_return;
1614     }
1615   err = REG_NOERROR;
1616  free_return:
1617   re_node_set_free (&cur_dest);
1618   return err;
1619 }
1620
1621 static reg_errcode_t
1622 internal_function
1623 build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1624                      Idx str_idx, re_node_set *cur_dest)
1625 {
1626   re_dfa_t *const dfa = mctx->dfa;
1627   re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1628   Idx i;
1629
1630   /* Then build the next sifted state.
1631      We build the next sifted state on `cur_dest', and update
1632      `sifted_states[str_idx]' with `cur_dest'.
1633      Note:
1634      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1635      `cur_src' points the node_set of the old `state_log[str_idx]'
1636      (with the epsilon nodes pre-filtered out).  */
1637   for (i = 0; i < cur_src->nelem; i++)
1638     {
1639       Idx prev_node = cur_src->elems[i];
1640       int naccepted = 0;
1641       int ret;
1642
1643 #ifdef DEBUG
1644       re_token_type_t type = dfa->nodes[prev_node].type;
1645       assert (!IS_EPSILON_NODE (type));
1646 #endif
1647 #ifdef RE_ENABLE_I18N
1648       /* If the node may accept `multi byte'.  */
1649       if (dfa->nodes[prev_node].accept_mb)
1650         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1651                                          str_idx, sctx->last_str_idx);
1652 #endif /* RE_ENABLE_I18N */
1653
1654       /* We don't check backreferences here.
1655          See update_cur_sifted_state().  */
1656       if (!naccepted
1657           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1658           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1659                                   dfa->nexts[prev_node]))
1660         naccepted = 1;
1661
1662       if (naccepted == 0)
1663         continue;
1664
1665       if (sctx->limits.nelem)
1666         {
1667           Idx to_idx = str_idx + naccepted;
1668           if (check_dst_limits (mctx, &sctx->limits,
1669                                 dfa->nexts[prev_node], to_idx,
1670                                 prev_node, str_idx))
1671             continue;
1672         }
1673       ret = re_node_set_insert (cur_dest, prev_node);
1674       if (BE (ret == -1, 0))
1675         return REG_ESPACE;
1676     }
1677
1678   return REG_NOERROR;
1679 }
1680
1681 /* Helper functions.  */
1682
1683 static reg_errcode_t
1684 internal_function
1685 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1686 {
1687   Idx top = mctx->state_log_top;
1688
1689   if (next_state_log_idx >= mctx->input.bufs_len
1690       || (next_state_log_idx >= mctx->input.valid_len
1691           && mctx->input.valid_len < mctx->input.len))
1692     {
1693       reg_errcode_t err;
1694       err = extend_buffers (mctx);
1695       if (BE (err != REG_NOERROR, 0))
1696         return err;
1697     }
1698
1699   if (top < next_state_log_idx)
1700     {
1701       memset (mctx->state_log + top + 1, '\0',
1702               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1703       mctx->state_log_top = next_state_log_idx;
1704     }
1705   return REG_NOERROR;
1706 }
1707
1708 static reg_errcode_t
1709 internal_function
1710 merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1711                    Idx num)
1712 {
1713   Idx st_idx;
1714   reg_errcode_t err;
1715   for (st_idx = 0; st_idx < num; ++st_idx)
1716     {
1717       if (dst[st_idx] == NULL)
1718         dst[st_idx] = src[st_idx];
1719       else if (src[st_idx] != NULL)
1720         {
1721           re_node_set merged_set;
1722           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1723                                         &src[st_idx]->nodes);
1724           if (BE (err != REG_NOERROR, 0))
1725             return err;
1726           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1727           re_node_set_free (&merged_set);
1728           if (BE (err != REG_NOERROR, 0))
1729             return err;
1730         }
1731     }
1732   return REG_NOERROR;
1733 }
1734
1735 static reg_errcode_t
1736 internal_function
1737 update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1738                          Idx str_idx, re_node_set *dest_nodes)
1739 {
1740   re_dfa_t *const dfa = mctx->dfa;
1741   reg_errcode_t err;
1742   const re_node_set *candidates;
1743   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1744                 : &mctx->state_log[str_idx]->nodes);
1745
1746   if (dest_nodes->nelem == 0)
1747     sctx->sifted_states[str_idx] = NULL;
1748   else
1749     {
1750       if (candidates)
1751         {
1752           /* At first, add the nodes which can epsilon transit to a node in
1753              DEST_NODE.  */
1754           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1755           if (BE (err != REG_NOERROR, 0))
1756             return err;
1757
1758           /* Then, check the limitations in the current sift_context.  */
1759           if (sctx->limits.nelem)
1760             {
1761               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1762                                          mctx->bkref_ents, str_idx);
1763               if (BE (err != REG_NOERROR, 0))
1764                 return err;
1765             }
1766         }
1767
1768       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1769       if (BE (err != REG_NOERROR, 0))
1770         return err;
1771     }
1772
1773   if (candidates && mctx->state_log[str_idx]->has_backref)
1774     {
1775       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1776       if (BE (err != REG_NOERROR, 0))
1777         return err;
1778     }
1779   return REG_NOERROR;
1780 }
1781
1782 static reg_errcode_t
1783 internal_function
1784 add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1785                        const re_node_set *candidates)
1786 {
1787   reg_errcode_t err = REG_NOERROR;
1788   Idx i;
1789
1790   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1791   if (BE (err != REG_NOERROR, 0))
1792     return err;
1793
1794   if (!state->inveclosure.alloc)
1795     {
1796       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1797       if (BE (err != REG_NOERROR, 0))
1798         return REG_ESPACE;
1799       for (i = 0; i < dest_nodes->nelem; i++)
1800         re_node_set_merge (&state->inveclosure,
1801                            dfa->inveclosures + dest_nodes->elems[i]);
1802     }
1803   return re_node_set_add_intersect (dest_nodes, candidates,
1804                                     &state->inveclosure);
1805 }
1806
1807 static reg_errcode_t
1808 internal_function
1809 sub_epsilon_src_nodes (re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1810                        const re_node_set *candidates)
1811 {
1812     Idx ecl_idx;
1813     reg_errcode_t err;
1814     re_node_set *inv_eclosure = dfa->inveclosures + node;
1815     re_node_set except_nodes;
1816     re_node_set_init_empty (&except_nodes);
1817     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1818       {
1819         Idx cur_node = inv_eclosure->elems[ecl_idx];
1820         if (cur_node == node)
1821           continue;
1822         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1823           {
1824             Idx edst1 = dfa->edests[cur_node].elems[0];
1825             Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1826                          ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1827             if ((!re_node_set_contains (inv_eclosure, edst1)
1828                  && re_node_set_contains (dest_nodes, edst1))
1829                 || (REG_VALID_NONZERO_INDEX (edst2)
1830                     && !re_node_set_contains (inv_eclosure, edst2)
1831                     && re_node_set_contains (dest_nodes, edst2)))
1832               {
1833                 err = re_node_set_add_intersect (&except_nodes, candidates,
1834                                                  dfa->inveclosures + cur_node);
1835                 if (BE (err != REG_NOERROR, 0))
1836                   {
1837                     re_node_set_free (&except_nodes);
1838                     return err;
1839                   }
1840               }
1841           }
1842       }
1843     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1844       {
1845         Idx cur_node = inv_eclosure->elems[ecl_idx];
1846         if (!re_node_set_contains (&except_nodes, cur_node))
1847           {
1848             Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1849             re_node_set_remove_at (dest_nodes, idx);
1850           }
1851       }
1852     re_node_set_free (&except_nodes);
1853     return REG_NOERROR;
1854 }
1855
1856 static int
1857 internal_function
1858 check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
1859                   Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1860 {
1861   re_dfa_t *const dfa = mctx->dfa;
1862   Idx lim_idx, src_pos, dst_pos;
1863
1864   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1865   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1866   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1867     {
1868       Idx subexp_idx;
1869       struct re_backref_cache_entry *ent;
1870       ent = mctx->bkref_ents + limits->elems[lim_idx];
1871       subexp_idx = dfa->nodes[ent->node].opr.idx;
1872
1873       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1874                                            subexp_idx, dst_node, dst_idx,
1875                                            dst_bkref_idx);
1876       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1877                                            subexp_idx, src_node, src_idx,
1878                                            src_bkref_idx);
1879
1880       /* In case of:
1881          <src> <dst> ( <subexp> )
1882          ( <subexp> ) <src> <dst>
1883          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1884       if (src_pos == dst_pos)
1885         continue; /* This is unrelated limitation.  */
1886       else
1887         return 1;
1888     }
1889   return 0;
1890 }
1891
1892 static int
1893 internal_function
1894 check_dst_limits_calc_pos_1 (re_match_context_t *mctx, int boundaries,
1895                              Idx subexp_idx, Idx from_node, Idx bkref_idx)
1896 {
1897   re_dfa_t *const dfa = mctx->dfa;
1898   re_node_set *eclosures = dfa->eclosures + from_node;
1899   Idx node_idx;
1900
1901   /* Else, we are on the boundary: examine the nodes on the epsilon
1902      closure.  */
1903   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1904     {
1905       Idx node = eclosures->elems[node_idx];
1906       switch (dfa->nodes[node].type)
1907         {
1908         case OP_BACK_REF:
1909           if (bkref_idx != REG_MISSING)
1910             {
1911               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1912               do
1913                 {
1914                   Idx dst;
1915                   int cpos;
1916
1917                   if (ent->node != node)
1918                     continue;
1919
1920                   if (subexp_idx
1921                       < CHAR_BIT * sizeof ent->eps_reachable_subexps_map
1922                       && !(ent->eps_reachable_subexps_map & (1u << subexp_idx)))
1923                     continue;
1924
1925                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1926                      OP_CLOSE_SUBEXP cases below.  But, if the
1927                      destination node is the same node as the source
1928                      node, don't recurse because it would cause an
1929                      infinite loop: a regex that exhibits this behavior
1930                      is ()\1*\1*  */
1931                   dst = dfa->edests[node].elems[0];
1932                   if (dst == from_node)
1933                     {
1934                       if (boundaries & 1)
1935                         return -1;
1936                       else /* if (boundaries & 2) */
1937                         return 0;
1938                     }
1939
1940                   cpos =
1941                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1942                                                  dst, bkref_idx);
1943                   if (cpos == -1 /* && (boundaries & 1) */)
1944                     return -1;
1945                   if (cpos == 0 && (boundaries & 2))
1946                     return 0;
1947
1948                   if (subexp_idx
1949                       < CHAR_BIT * sizeof ent->eps_reachable_subexps_map)
1950                     ent->eps_reachable_subexps_map &= ~(1u << subexp_idx);
1951                 }
1952               while (ent++->more);
1953             }
1954           break;
1955
1956         case OP_OPEN_SUBEXP:
1957           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1958             return -1;
1959           break;
1960
1961         case OP_CLOSE_SUBEXP:
1962           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1963             return 0;
1964           break;
1965
1966         default:
1967             break;
1968         }
1969     }
1970
1971   return (boundaries & 2) ? 1 : 0;
1972 }
1973
1974 static int
1975 internal_function
1976 check_dst_limits_calc_pos (re_match_context_t *mctx, Idx limit, Idx subexp_idx,
1977                            Idx from_node, Idx str_idx, Idx bkref_idx)
1978 {
1979   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1980   int boundaries;
1981
1982   /* If we are outside the range of the subexpression, return -1 or 1.  */
1983   if (str_idx < lim->subexp_from)
1984     return -1;
1985
1986   if (lim->subexp_to < str_idx)
1987     return 1;
1988
1989   /* If we are within the subexpression, return 0.  */
1990   boundaries = (str_idx == lim->subexp_from);
1991   boundaries |= (str_idx == lim->subexp_to) << 1;
1992   if (boundaries == 0)
1993     return 0;
1994
1995   /* Else, examine epsilon closure.  */
1996   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1997                                       from_node, bkref_idx);
1998 }
1999
2000 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2001    which are against limitations from DEST_NODES. */
2002
2003 static reg_errcode_t
2004 internal_function
2005 check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
2006                      const re_node_set *candidates, re_node_set *limits,
2007                      struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2008 {
2009   reg_errcode_t err;
2010   Idx node_idx, lim_idx;
2011
2012   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2013     {
2014       Idx subexp_idx;
2015       struct re_backref_cache_entry *ent;
2016       ent = bkref_ents + limits->elems[lim_idx];
2017
2018       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2019         continue; /* This is unrelated limitation.  */
2020
2021       subexp_idx = dfa->nodes[ent->node].opr.idx;
2022       if (ent->subexp_to == str_idx)
2023         {
2024           Idx ops_node = REG_MISSING;
2025           Idx cls_node = REG_MISSING;
2026           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2027             {
2028               Idx node = dest_nodes->elems[node_idx];
2029               re_token_type_t type = dfa->nodes[node].type;
2030               if (type == OP_OPEN_SUBEXP
2031                   && subexp_idx == dfa->nodes[node].opr.idx)
2032                 ops_node = node;
2033               else if (type == OP_CLOSE_SUBEXP
2034                        && subexp_idx == dfa->nodes[node].opr.idx)
2035                 cls_node = node;
2036             }
2037
2038           /* Check the limitation of the open subexpression.  */
2039           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2040           if (REG_VALID_INDEX (ops_node))
2041             {
2042               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2043                                            candidates);
2044               if (BE (err != REG_NOERROR, 0))
2045                 return err;
2046             }
2047
2048           /* Check the limitation of the close subexpression.  */
2049           if (REG_VALID_INDEX (cls_node))
2050             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2051               {
2052                 Idx node = dest_nodes->elems[node_idx];
2053                 if (!re_node_set_contains (dfa->inveclosures + node,
2054                                            cls_node)
2055                     && !re_node_set_contains (dfa->eclosures + node,
2056                                               cls_node))
2057                   {
2058                     /* It is against this limitation.
2059                        Remove it form the current sifted state.  */
2060                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2061                                                  candidates);
2062                     if (BE (err != REG_NOERROR, 0))
2063                       return err;
2064                     --node_idx;
2065                   }
2066               }
2067         }
2068       else /* (ent->subexp_to != str_idx)  */
2069         {
2070           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2071             {
2072               Idx node = dest_nodes->elems[node_idx];
2073               re_token_type_t type = dfa->nodes[node].type;
2074               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2075                 {
2076                   if (subexp_idx != dfa->nodes[node].opr.idx)
2077                     continue;
2078                   /* It is against this limitation.
2079                      Remove it form the current sifted state.  */
2080                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2081                                                candidates);
2082                   if (BE (err != REG_NOERROR, 0))
2083                     return err;
2084                 }
2085             }
2086         }
2087     }
2088   return REG_NOERROR;
2089 }
2090
2091 static reg_errcode_t
2092 internal_function
2093 sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2094                    Idx str_idx, const re_node_set *candidates)
2095 {
2096   re_dfa_t *const dfa = mctx->dfa;
2097   reg_errcode_t err;
2098   Idx node_idx, node;
2099   re_sift_context_t local_sctx;
2100   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2101
2102   if (first_idx == REG_MISSING)
2103     return REG_NOERROR;
2104
2105   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2106
2107   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2108     {
2109       Idx enabled_idx;
2110       re_token_type_t type;
2111       struct re_backref_cache_entry *entry;
2112       node = candidates->elems[node_idx];
2113       type = dfa->nodes[node].type;
2114       /* Avoid infinite loop for the REs like "()\1+".  */
2115       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2116         continue;
2117       if (type != OP_BACK_REF)
2118         continue;
2119
2120       entry = mctx->bkref_ents + first_idx;
2121       enabled_idx = first_idx;
2122       do
2123         {
2124           Idx subexp_len, to_idx, dst_node, ret;
2125           re_dfastate_t *cur_state;
2126
2127           if (entry->node != node)
2128             continue;
2129           subexp_len = entry->subexp_to - entry->subexp_from;
2130           to_idx = str_idx + subexp_len;
2131           dst_node = (subexp_len ? dfa->nexts[node]
2132                       : dfa->edests[node].elems[0]);
2133
2134           if (to_idx > sctx->last_str_idx
2135               || sctx->sifted_states[to_idx] == NULL
2136               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2137               || check_dst_limits (mctx, &sctx->limits, node,
2138                                    str_idx, dst_node, to_idx))
2139             continue;
2140
2141           if (local_sctx.sifted_states == NULL)
2142             {
2143               local_sctx = *sctx;
2144               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2145               if (BE (err != REG_NOERROR, 0))
2146                 goto free_return;
2147             }
2148           local_sctx.last_node = node;
2149           local_sctx.last_str_idx = str_idx;
2150           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2151           if (BE (ret < 0, 0))
2152             {
2153               err = REG_ESPACE;
2154               goto free_return;
2155             }
2156           cur_state = local_sctx.sifted_states[str_idx];
2157           err = sift_states_backward (mctx, &local_sctx);
2158           if (BE (err != REG_NOERROR, 0))
2159             goto free_return;
2160           if (sctx->limited_states != NULL)
2161             {
2162               err = merge_state_array (dfa, sctx->limited_states,
2163                                        local_sctx.sifted_states,
2164                                        str_idx + 1);
2165               if (BE (err != REG_NOERROR, 0))
2166                 goto free_return;
2167             }
2168           local_sctx.sifted_states[str_idx] = cur_state;
2169           re_node_set_remove (&local_sctx.limits, enabled_idx);
2170
2171           /* mctx->bkref_ents may have changed, reload the pointer.  */
2172           entry = mctx->bkref_ents + enabled_idx;
2173         }
2174       while (enabled_idx++, entry++->more);
2175     }
2176   err = REG_NOERROR;
2177  free_return:
2178   if (local_sctx.sifted_states != NULL)
2179     {
2180       re_node_set_free (&local_sctx.limits);
2181     }
2182
2183   return err;
2184 }
2185
2186
2187 #ifdef RE_ENABLE_I18N
2188 static int
2189 internal_function
2190 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2191                      Idx node_idx, Idx str_idx, Idx max_str_idx)
2192 {
2193   re_dfa_t *const dfa = mctx->dfa;
2194   int naccepted;
2195   /* Check the node can accept `multi byte'.  */
2196   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2197   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2198       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2199                             dfa->nexts[node_idx]))
2200     /* The node can't accept the `multi byte', or the
2201        destination was already thrown away, then the node
2202        could't accept the current input `multi byte'.   */
2203     naccepted = 0;
2204   /* Otherwise, it is sure that the node could accept
2205      `naccepted' bytes input.  */
2206   return naccepted;
2207 }
2208 #endif /* RE_ENABLE_I18N */
2209
2210 \f
2211 /* Functions for state transition.  */
2212
2213 /* Return the next state to which the current state STATE will transit by
2214    accepting the current input byte, and update STATE_LOG if necessary.
2215    If STATE can accept a multibyte char/collating element/back reference
2216    update the destination of STATE_LOG.  */
2217
2218 static re_dfastate_t *
2219 internal_function
2220 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2221                re_dfastate_t *state)
2222 {
2223   re_dfastate_t **trtable;
2224   unsigned char ch;
2225
2226 #ifdef RE_ENABLE_I18N
2227   /* If the current state can accept multibyte.  */
2228   if (BE (state->accept_mb, 0))
2229     {
2230       *err = transit_state_mb (mctx, state);
2231       if (BE (*err != REG_NOERROR, 0))
2232         return NULL;
2233     }
2234 #endif /* RE_ENABLE_I18N */
2235
2236   /* Then decide the next state with the single byte.  */
2237 #if 0
2238   if (0)
2239     /* don't use transition table  */
2240     return transit_state_sb (err, mctx, state);
2241 #endif
2242
2243   /* Use transition table  */
2244   ch = re_string_fetch_byte (&mctx->input);
2245   for (;;)
2246     {
2247       trtable = state->trtable;
2248       if (BE (trtable != NULL, 1))
2249         return trtable[ch];
2250
2251       trtable = state->word_trtable;
2252       if (BE (trtable != NULL, 1))
2253         {
2254           unsigned int context;
2255           context
2256             = re_string_context_at (&mctx->input,
2257                                     re_string_cur_idx (&mctx->input) - 1,
2258                                     mctx->eflags);
2259           if (IS_WORD_CONTEXT (context))
2260             return trtable[ch + SBC_MAX];
2261           else
2262             return trtable[ch];
2263         }
2264
2265       if (!build_trtable (mctx->dfa, state))
2266         {
2267           *err = REG_ESPACE;
2268           return NULL;
2269         }
2270
2271       /* Retry, we now have a transition table.  */
2272     }
2273 }
2274
2275 /* Update the state_log if we need */
2276 re_dfastate_t *
2277 internal_function
2278 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2279                       re_dfastate_t *next_state)
2280 {
2281   re_dfa_t *const dfa = mctx->dfa;
2282   Idx cur_idx = re_string_cur_idx (&mctx->input);
2283
2284   if (cur_idx > mctx->state_log_top)
2285     {
2286       mctx->state_log[cur_idx] = next_state;
2287       mctx->state_log_top = cur_idx;
2288     }
2289   else if (mctx->state_log[cur_idx] == 0)
2290     {
2291       mctx->state_log[cur_idx] = next_state;
2292     }
2293   else
2294     {
2295       re_dfastate_t *pstate;
2296       unsigned int context;
2297       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2298       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2299          the destination of a multibyte char/collating element/
2300          back reference.  Then the next state is the union set of
2301          these destinations and the results of the transition table.  */
2302       pstate = mctx->state_log[cur_idx];
2303       log_nodes = pstate->entrance_nodes;
2304       if (next_state != NULL)
2305         {
2306           table_nodes = next_state->entrance_nodes;
2307           *err = re_node_set_init_union (&next_nodes, table_nodes,
2308                                              log_nodes);
2309           if (BE (*err != REG_NOERROR, 0))
2310             return NULL;
2311         }
2312       else
2313         next_nodes = *log_nodes;
2314       /* Note: We already add the nodes of the initial state,
2315          then we don't need to add them here.  */
2316
2317       context = re_string_context_at (&mctx->input,
2318                                       re_string_cur_idx (&mctx->input) - 1,
2319                                       mctx->eflags);
2320       next_state = mctx->state_log[cur_idx]
2321         = re_acquire_state_context (err, dfa, &next_nodes, context);
2322       /* We don't need to check errors here, since the return value of
2323          this function is next_state and ERR is already set.  */
2324
2325       if (table_nodes != NULL)
2326         re_node_set_free (&next_nodes);
2327     }
2328
2329   if (BE (dfa->nbackref, 0) && next_state != NULL)
2330     {
2331       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2332          later.  We must check them here, since the back references in the
2333          next state might use them.  */
2334       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2335                                         cur_idx);
2336       if (BE (*err != REG_NOERROR, 0))
2337         return NULL;
2338
2339       /* If the next state has back references.  */
2340       if (next_state->has_backref)
2341         {
2342           *err = transit_state_bkref (mctx, &next_state->nodes);
2343           if (BE (*err != REG_NOERROR, 0))
2344             return NULL;
2345           next_state = mctx->state_log[cur_idx];
2346         }
2347     }
2348
2349   return next_state;
2350 }
2351
2352 /* Skip bytes in the input that correspond to part of a
2353    multi-byte match, then look in the log for a state
2354    from which to restart matching.  */
2355 static re_dfastate_t *
2356 internal_function
2357 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2358 {
2359   re_dfastate_t *cur_state = NULL;
2360   do
2361     {
2362       Idx max = mctx->state_log_top;
2363       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2364
2365       do
2366         {
2367           if (++cur_str_idx > max)
2368             return NULL;
2369           re_string_skip_bytes (&mctx->input, 1);
2370         }
2371       while (mctx->state_log[cur_str_idx] == NULL);
2372
2373       cur_state = merge_state_with_log (err, mctx, NULL);
2374     }
2375   while (*err == REG_NOERROR && cur_state == NULL);
2376   return cur_state;
2377 }
2378
2379 /* Helper functions for transit_state.  */
2380
2381 /* From the node set CUR_NODES, pick up the nodes whose types are
2382    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2383    expression. And register them to use them later for evaluating the
2384    correspoding back references.  */
2385
2386 static reg_errcode_t
2387 internal_function
2388 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2389                            Idx str_idx)
2390 {
2391   re_dfa_t *const dfa = mctx->dfa;
2392   Idx node_idx;
2393   reg_errcode_t err;
2394
2395   /* TODO: This isn't efficient.
2396            Because there might be more than one nodes whose types are
2397            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2398            nodes.
2399            E.g. RE: (a){2}  */
2400   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2401     {
2402       Idx node = cur_nodes->elems[node_idx];
2403       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2404           && dfa->nodes[node].opr.idx < CHAR_BIT * sizeof dfa->used_bkref_map
2405           && dfa->used_bkref_map & (1u << dfa->nodes[node].opr.idx))
2406         {
2407           err = match_ctx_add_subtop (mctx, node, str_idx);
2408           if (BE (err != REG_NOERROR, 0))
2409             return err;
2410         }
2411     }
2412   return REG_NOERROR;
2413 }
2414
2415 #if 0
2416 /* Return the next state to which the current state STATE will transit by
2417    accepting the current input byte.  */
2418
2419 static re_dfastate_t *
2420 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2421                   re_dfastate_t *state)
2422 {
2423   re_dfa_t *const dfa = mctx->dfa;
2424   re_node_set next_nodes;
2425   re_dfastate_t *next_state;
2426   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2427   unsigned int context;
2428
2429   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2430   if (BE (*err != REG_NOERROR, 0))
2431     return NULL;
2432   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2433     {
2434       Idx cur_node = state->nodes.elems[node_cnt];
2435       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2436         {
2437           *err = re_node_set_merge (&next_nodes,
2438                                     dfa->eclosures + dfa->nexts[cur_node]);
2439           if (BE (*err != REG_NOERROR, 0))
2440             {
2441               re_node_set_free (&next_nodes);
2442               return NULL;
2443             }
2444         }
2445     }
2446   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2447   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2448   /* We don't need to check errors here, since the return value of
2449      this function is next_state and ERR is already set.  */
2450
2451   re_node_set_free (&next_nodes);
2452   re_string_skip_bytes (&mctx->input, 1);
2453   return next_state;
2454 }
2455 #endif
2456
2457 #ifdef RE_ENABLE_I18N
2458 static reg_errcode_t
2459 internal_function
2460 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2461 {
2462   re_dfa_t *const dfa = mctx->dfa;
2463   reg_errcode_t err;
2464   Idx i;
2465
2466   for (i = 0; i < pstate->nodes.nelem; ++i)
2467     {
2468       re_node_set dest_nodes, *new_nodes;
2469       Idx cur_node_idx = pstate->nodes.elems[i];
2470       int naccepted;
2471       Idx dest_idx;
2472       unsigned int context;
2473       re_dfastate_t *dest_state;
2474
2475       if (!dfa->nodes[cur_node_idx].accept_mb)
2476         continue;
2477
2478       if (dfa->nodes[cur_node_idx].constraint)
2479         {
2480           context = re_string_context_at (&mctx->input,
2481                                           re_string_cur_idx (&mctx->input),
2482                                           mctx->eflags);
2483           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2484                                            context))
2485             continue;
2486         }
2487
2488       /* How many bytes the node can accept?  */
2489       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2490                                            re_string_cur_idx (&mctx->input));
2491       if (naccepted == 0)
2492         continue;
2493
2494       /* The node can accepts `naccepted' bytes.  */
2495       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2496       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2497                                : mctx->max_mb_elem_len);
2498       err = clean_state_log_if_needed (mctx, dest_idx);
2499       if (BE (err != REG_NOERROR, 0))
2500         return err;
2501 #ifdef DEBUG
2502       assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2503 #endif
2504       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2505
2506       dest_state = mctx->state_log[dest_idx];
2507       if (dest_state == NULL)
2508         dest_nodes = *new_nodes;
2509       else
2510         {
2511           err = re_node_set_init_union (&dest_nodes,
2512                                         dest_state->entrance_nodes, new_nodes);
2513           if (BE (err != REG_NOERROR, 0))
2514             return err;
2515         }
2516       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2517       mctx->state_log[dest_idx]
2518         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2519       if (dest_state != NULL)
2520         re_node_set_free (&dest_nodes);
2521       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2522         return err;
2523     }
2524   return REG_NOERROR;
2525 }
2526 #endif /* RE_ENABLE_I18N */
2527
2528 static reg_errcode_t
2529 internal_function
2530 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2531 {
2532   re_dfa_t *const dfa = mctx->dfa;
2533   reg_errcode_t err;
2534   Idx i;
2535   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2536
2537   for (i = 0; i < nodes->nelem; ++i)
2538     {
2539       Idx dest_str_idx, prev_nelem, bkc_idx;
2540       Idx node_idx = nodes->elems[i];
2541       unsigned int context;
2542       const re_token_t *node = dfa->nodes + node_idx;
2543       re_node_set *new_dest_nodes;
2544
2545       /* Check whether `node' is a backreference or not.  */
2546       if (node->type != OP_BACK_REF)
2547         continue;
2548
2549       if (node->constraint)
2550         {
2551           context = re_string_context_at (&mctx->input, cur_str_idx,
2552                                           mctx->eflags);
2553           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2554             continue;
2555         }
2556
2557       /* `node' is a backreference.
2558          Check the substring which the substring matched.  */
2559       bkc_idx = mctx->nbkref_ents;
2560       err = get_subexp (mctx, node_idx, cur_str_idx);
2561       if (BE (err != REG_NOERROR, 0))
2562         goto free_return;
2563
2564       /* And add the epsilon closures (which is `new_dest_nodes') of
2565          the backreference to appropriate state_log.  */
2566 #ifdef DEBUG
2567       assert (dfa->nexts[node_idx] != REG_MISSING);
2568 #endif
2569       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2570         {
2571           Idx subexp_len;
2572           re_dfastate_t *dest_state;
2573           struct re_backref_cache_entry *bkref_ent;
2574           bkref_ent = mctx->bkref_ents + bkc_idx;
2575           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2576             continue;
2577           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2578           new_dest_nodes = (subexp_len == 0
2579                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2580                             : dfa->eclosures + dfa->nexts[node_idx]);
2581           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2582                           - bkref_ent->subexp_from);
2583           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2584                                           mctx->eflags);
2585           dest_state = mctx->state_log[dest_str_idx];
2586           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2587                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2588           /* Add `new_dest_node' to state_log.  */
2589           if (dest_state == NULL)
2590             {
2591               mctx->state_log[dest_str_idx]
2592                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2593                                             context);
2594               if (BE (mctx->state_log[dest_str_idx] == NULL
2595                       && err != REG_NOERROR, 0))
2596                 goto free_return;
2597             }
2598           else
2599             {
2600               re_node_set dest_nodes;
2601               err = re_node_set_init_union (&dest_nodes,
2602                                             dest_state->entrance_nodes,
2603                                             new_dest_nodes);
2604               if (BE (err != REG_NOERROR, 0))
2605                 {
2606                   re_node_set_free (&dest_nodes);
2607                   goto free_return;
2608                 }
2609               mctx->state_log[dest_str_idx]
2610                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2611               re_node_set_free (&dest_nodes);
2612               if (BE (mctx->state_log[dest_str_idx] == NULL
2613                       && err != REG_NOERROR, 0))
2614                 goto free_return;
2615             }
2616           /* We need to check recursively if the backreference can epsilon
2617              transit.  */
2618           if (subexp_len == 0
2619               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2620             {
2621               err = check_subexp_matching_top (mctx, new_dest_nodes,
2622                                                cur_str_idx);
2623               if (BE (err != REG_NOERROR, 0))
2624                 goto free_return;
2625               err = transit_state_bkref (mctx, new_dest_nodes);
2626               if (BE (err != REG_NOERROR, 0))
2627                 goto free_return;
2628             }
2629         }
2630     }
2631   err = REG_NOERROR;
2632  free_return:
2633   return err;
2634 }
2635
2636 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2637    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2638    Note that we might collect inappropriate candidates here.
2639    However, the cost of checking them strictly here is too high, then we
2640    delay these checking for prune_impossible_nodes().  */
2641
2642 static reg_errcode_t
2643 internal_function
2644 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2645 {
2646   re_dfa_t *const dfa = mctx->dfa;
2647   Idx subexp_num, sub_top_idx;
2648   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2649   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2650   Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2651   if (cache_idx != REG_MISSING)
2652     {
2653       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2654       do
2655         if (entry->node == bkref_node)
2656           return REG_NOERROR; /* We already checked it.  */
2657       while (entry++->more);
2658     }
2659
2660   subexp_num = dfa->nodes[bkref_node].opr.idx;
2661
2662   /* For each sub expression  */
2663   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2664     {
2665       reg_errcode_t err;
2666       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2667       re_sub_match_last_t *sub_last;
2668       Idx sub_last_idx, sl_str, bkref_str_off;
2669
2670       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2671         continue; /* It isn't related.  */
2672
2673       sl_str = sub_top->str_idx;
2674       bkref_str_off = bkref_str_idx;
2675       /* At first, check the last node of sub expressions we already
2676          evaluated.  */
2677       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2678         {
2679           regoff_t sl_str_diff;
2680           sub_last = sub_top->lasts[sub_last_idx];
2681           sl_str_diff = sub_last->str_idx - sl_str;
2682           /* The matched string by the sub expression match with the substring
2683              at the back reference?  */
2684           if (sl_str_diff > 0)
2685             {
2686               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2687                 {
2688                   /* Not enough chars for a successful match.  */
2689                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2690                     break;
2691
2692                   err = clean_state_log_if_needed (mctx,
2693                                                    bkref_str_off
2694                                                    + sl_str_diff);
2695                   if (BE (err != REG_NOERROR, 0))
2696                     return err;
2697                   buf = (const char *) re_string_get_buffer (&mctx->input);
2698                 }
2699               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2700                 break; /* We don't need to search this sub expression any more.  */
2701             }
2702           bkref_str_off += sl_str_diff;
2703           sl_str += sl_str_diff;
2704           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2705                                 bkref_str_idx);
2706
2707           /* Reload buf, since the preceding call might have reallocated
2708              the buffer.  */
2709           buf = (const char *) re_string_get_buffer (&mctx->input);
2710
2711           if (err == REG_NOMATCH)
2712             continue;
2713           if (BE (err != REG_NOERROR, 0))
2714             return err;
2715         }
2716
2717       if (sub_last_idx < sub_top->nlasts)
2718         continue;
2719       if (sub_last_idx > 0)
2720         ++sl_str;
2721       /* Then, search for the other last nodes of the sub expression.  */
2722       for (; sl_str <= bkref_str_idx; ++sl_str)
2723         {
2724           Idx cls_node;
2725           regoff_t sl_str_off;
2726           const re_node_set *nodes;
2727           sl_str_off = sl_str - sub_top->str_idx;
2728           /* The matched string by the sub expression match with the substring
2729              at the back reference?  */
2730           if (sl_str_off > 0)
2731             {
2732               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2733                 {
2734                   /* If we are at the end of the input, we cannot match.  */
2735                   if (bkref_str_off >= mctx->input.len)
2736                     break;
2737
2738                   err = extend_buffers (mctx);
2739                   if (BE (err != REG_NOERROR, 0))
2740                     return err;
2741
2742                   buf = (const char *) re_string_get_buffer (&mctx->input);
2743                 }
2744               if (buf [bkref_str_off++] != buf[sl_str - 1])
2745                 break; /* We don't need to search this sub expression
2746                           any more.  */
2747             }
2748           if (mctx->state_log[sl_str] == NULL)
2749             continue;
2750           /* Does this state have a ')' of the sub expression?  */
2751           nodes = &mctx->state_log[sl_str]->nodes;
2752           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2753           if (cls_node == REG_MISSING)
2754             continue; /* No.  */
2755           if (sub_top->path == NULL)
2756             {
2757               sub_top->path = re_calloc (state_array_t,
2758                                          sl_str - sub_top->str_idx + 1);
2759               if (sub_top->path == NULL)
2760                 return REG_ESPACE;
2761             }
2762           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2763              in the current context?  */
2764           err = check_arrival (mctx, sub_top->path, sub_top->node,
2765                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2766           if (err == REG_NOMATCH)
2767               continue;
2768           if (BE (err != REG_NOERROR, 0))
2769               return err;
2770           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2771           if (BE (sub_last == NULL, 0))
2772             return REG_ESPACE;
2773           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2774                                 bkref_str_idx);
2775           if (err == REG_NOMATCH)
2776             continue;
2777         }
2778     }
2779   return REG_NOERROR;
2780 }
2781
2782 /* Helper functions for get_subexp().  */
2783
2784 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2785    If it can arrive, register the sub expression expressed with SUB_TOP
2786    and SUB_LAST.  */
2787
2788 static reg_errcode_t
2789 internal_function
2790 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2791                 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2792 {
2793   reg_errcode_t err;
2794   Idx to_idx;
2795   /* Can the subexpression arrive the back reference?  */
2796   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2797                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2798   if (err != REG_NOERROR)
2799     return err;
2800   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2801                              sub_last->str_idx);
2802   if (BE (err != REG_NOERROR, 0))
2803     return err;
2804   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2805   return clean_state_log_if_needed (mctx, to_idx);
2806 }
2807
2808 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2809    Search '(' if FL_OPEN, or search ')' otherwise.
2810    TODO: This function isn't efficient...
2811          Because there might be more than one nodes whose types are
2812          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2813          nodes.
2814          E.g. RE: (a){2}  */
2815
2816 static Idx
2817 internal_function
2818 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2819                   Idx subexp_idx, int type)
2820 {
2821   Idx cls_idx;
2822   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2823     {
2824       Idx cls_node = nodes->elems[cls_idx];
2825       const re_token_t *node = dfa->nodes + cls_node;
2826       if (node->type == type
2827           && node->opr.idx == subexp_idx)
2828         return cls_node;
2829     }
2830   return REG_MISSING;
2831 }
2832
2833 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2834    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2835    heavily reused.
2836    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2837
2838 static reg_errcode_t
2839 internal_function
2840 check_arrival (re_match_context_t *mctx, state_array_t *path,
2841                Idx top_node, Idx top_str, Idx last_node, Idx last_str,
2842                int type)
2843 {
2844   re_dfa_t *const dfa = mctx->dfa;
2845   reg_errcode_t err;
2846   Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2847   re_dfastate_t *cur_state = NULL;
2848   re_node_set *cur_nodes, next_nodes;
2849   re_dfastate_t **backup_state_log;
2850   unsigned int context;
2851
2852   subexp_num = dfa->nodes[top_node].opr.idx;
2853   /* Extend the buffer if we need.  */
2854   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2855     {
2856       re_dfastate_t **new_array;
2857       Idx old_alloc = path->alloc;
2858       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2859       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2860       if (new_array == NULL)
2861         {
2862           path->alloc = old_alloc;
2863           return REG_ESPACE;
2864         }
2865       path->array = new_array;
2866       memset (new_array + old_alloc, '\0',
2867               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2868     }
2869
2870   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2871
2872   /* Temporary modify MCTX.  */
2873   backup_state_log = mctx->state_log;
2874   backup_cur_idx = mctx->input.cur_idx;
2875   mctx->state_log = path->array;
2876   mctx->input.cur_idx = str_idx;
2877
2878   /* Setup initial node set.  */
2879   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2880   if (str_idx == top_str)
2881     {
2882       err = re_node_set_init_1 (&next_nodes, top_node);
2883       if (BE (err != REG_NOERROR, 0))
2884         return err;
2885       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2886       if (BE (err != REG_NOERROR, 0))
2887         {
2888           re_node_set_free (&next_nodes);
2889           return err;
2890         }
2891     }
2892   else
2893     {
2894       cur_state = mctx->state_log[str_idx];
2895       if (cur_state && cur_state->has_backref)
2896         {
2897           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2898           if (BE ( err != REG_NOERROR, 0))
2899             return err;
2900         }
2901       else
2902         re_node_set_init_empty (&next_nodes);
2903     }
2904   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2905     {
2906       if (next_nodes.nelem)
2907         {
2908           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2909                                     subexp_num, type);
2910           if (BE ( err != REG_NOERROR, 0))
2911             {
2912               re_node_set_free (&next_nodes);
2913               return err;
2914             }
2915         }
2916       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2917       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2918         {
2919           re_node_set_free (&next_nodes);
2920           return err;
2921         }
2922       mctx->state_log[str_idx] = cur_state;
2923     }
2924
2925   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2926     {
2927       re_node_set_empty (&next_nodes);
2928       if (mctx->state_log[str_idx + 1])
2929         {
2930           err = re_node_set_merge (&next_nodes,
2931                                    &mctx->state_log[str_idx + 1]->nodes);
2932           if (BE (err != REG_NOERROR, 0))
2933             {
2934               re_node_set_free (&next_nodes);
2935               return err;
2936             }
2937         }
2938       if (cur_state)
2939         {
2940           err = check_arrival_add_next_nodes (mctx, str_idx,
2941                                               &cur_state->non_eps_nodes, &next_nodes);
2942           if (BE (err != REG_NOERROR, 0))
2943             {
2944               re_node_set_free (&next_nodes);
2945               return err;
2946             }
2947         }
2948       ++str_idx;
2949       if (next_nodes.nelem)
2950         {
2951           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2952           if (BE (err != REG_NOERROR, 0))
2953             {
2954               re_node_set_free (&next_nodes);
2955               return err;
2956             }
2957           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2958                                     subexp_num, type);
2959           if (BE ( err != REG_NOERROR, 0))
2960             {
2961               re_node_set_free (&next_nodes);
2962               return err;
2963             }
2964         }
2965       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2966       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2967       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2968         {
2969           re_node_set_free (&next_nodes);
2970           return err;
2971         }
2972       mctx->state_log[str_idx] = cur_state;
2973       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2974     }
2975   re_node_set_free (&next_nodes);
2976   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2977                : &mctx->state_log[last_str]->nodes);
2978   path->next_idx = str_idx;
2979
2980   /* Fix MCTX.  */
2981   mctx->state_log = backup_state_log;
2982   mctx->input.cur_idx = backup_cur_idx;
2983
2984   /* Then check the current node set has the node LAST_NODE.  */
2985   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2986     return REG_NOERROR;
2987
2988   return REG_NOMATCH;
2989 }
2990
2991 /* Helper functions for check_arrival.  */
2992
2993 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2994    to NEXT_NODES.
2995    TODO: This function is similar to the functions transit_state*(),
2996          however this function has many additional works.
2997          Can't we unify them?  */
2998
2999 static reg_errcode_t
3000 internal_function
3001 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3002                               re_node_set *cur_nodes,
3003                               re_node_set *next_nodes)
3004 {
3005   re_dfa_t *const dfa = mctx->dfa;
3006   int result;
3007   Idx cur_idx;
3008   reg_errcode_t err;
3009   re_node_set union_set;
3010   re_node_set_init_empty (&union_set);
3011   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3012     {
3013       int naccepted = 0;
3014       Idx cur_node = cur_nodes->elems[cur_idx];
3015 #ifdef DEBUG
3016       re_token_type_t type = dfa->nodes[cur_node].type;
3017       assert (!IS_EPSILON_NODE (type));
3018 #endif
3019 #ifdef RE_ENABLE_I18N
3020       /* If the node may accept `multi byte'.  */
3021       if (dfa->nodes[cur_node].accept_mb)
3022         {
3023           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3024                                                str_idx);
3025           if (naccepted > 1)
3026             {
3027               re_dfastate_t *dest_state;
3028               Idx next_node = dfa->nexts[cur_node];
3029               Idx next_idx = str_idx + naccepted;
3030               dest_state = mctx->state_log[next_idx];
3031               re_node_set_empty (&union_set);
3032               if (dest_state)
3033                 {
3034                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3035                   if (BE (err != REG_NOERROR, 0))
3036                     {
3037                       re_node_set_free (&union_set);
3038                       return err;
3039                     }
3040                 }
3041               result = re_node_set_insert (&union_set, next_node);
3042               if (BE (result < 0, 0))
3043                 {
3044                   re_node_set_free (&union_set);
3045                   return REG_ESPACE;
3046                 }
3047               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3048                                                             &union_set);
3049               if (BE (mctx->state_log[next_idx] == NULL
3050                       && err != REG_NOERROR, 0))
3051                 {
3052                   re_node_set_free (&union_set);
3053                   return err;
3054                 }
3055             }
3056         }
3057 #endif /* RE_ENABLE_I18N */
3058       if (naccepted
3059           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3060         {
3061           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3062           if (BE (result < 0, 0))
3063             {
3064               re_node_set_free (&union_set);
3065               return REG_ESPACE;
3066             }
3067         }
3068     }
3069   re_node_set_free (&union_set);
3070   return REG_NOERROR;
3071 }
3072
3073 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3074    CUR_NODES, however exclude the nodes which are:
3075     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3076     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3077 */
3078
3079 static reg_errcode_t
3080 internal_function
3081 check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3082                           Idx ex_subexp, int type)
3083 {
3084   reg_errcode_t err;
3085   Idx idx, outside_node;
3086   re_node_set new_nodes;
3087 #ifdef DEBUG
3088   assert (cur_nodes->nelem);
3089 #endif
3090   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3091   if (BE (err != REG_NOERROR, 0))
3092     return err;
3093   /* Create a new node set NEW_NODES with the nodes which are epsilon
3094      closures of the node in CUR_NODES.  */
3095
3096   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3097     {
3098       Idx cur_node = cur_nodes->elems[idx];
3099       re_node_set *eclosure = dfa->eclosures + cur_node;
3100       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3101       if (outside_node == REG_MISSING)
3102         {
3103           /* There are no problematic nodes, just merge them.  */
3104           err = re_node_set_merge (&new_nodes, eclosure);
3105           if (BE (err != REG_NOERROR, 0))
3106             {
3107               re_node_set_free (&new_nodes);
3108               return err;
3109             }
3110         }
3111       else
3112         {
3113           /* There are problematic nodes, re-calculate incrementally.  */
3114           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3115                                               ex_subexp, type);
3116           if (BE (err != REG_NOERROR, 0))
3117             {
3118               re_node_set_free (&new_nodes);
3119               return err;
3120             }
3121         }
3122     }
3123   re_node_set_free (cur_nodes);
3124   *cur_nodes = new_nodes;
3125   return REG_NOERROR;
3126 }
3127
3128 /* Helper function for check_arrival_expand_ecl.
3129    Check incrementally the epsilon closure of TARGET, and if it isn't
3130    problematic append it to DST_NODES.  */
3131
3132 static reg_errcode_t
3133 internal_function
3134 check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3135                               Idx target, Idx ex_subexp, int type)
3136 {
3137   Idx cur_node;
3138   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3139     {
3140       int err;
3141
3142       if (dfa->nodes[cur_node].type == type
3143           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3144         {
3145           if (type == OP_CLOSE_SUBEXP)
3146             {
3147               err = re_node_set_insert (dst_nodes, cur_node);
3148               if (BE (err == -1, 0))
3149                 return REG_ESPACE;
3150             }
3151           break;
3152         }
3153       err = re_node_set_insert (dst_nodes, cur_node);
3154       if (BE (err == -1, 0))
3155         return REG_ESPACE;
3156       if (dfa->edests[cur_node].nelem == 0)
3157         break;
3158       if (dfa->edests[cur_node].nelem == 2)
3159         {
3160           reg_errcode_t ret =
3161             check_arrival_expand_ecl_sub (dfa, dst_nodes,
3162                                           dfa->edests[cur_node].elems[1],
3163                                           ex_subexp, type);
3164           if (BE (ret != REG_NOERROR, 0))
3165             return ret;
3166         }
3167       cur_node = dfa->edests[cur_node].elems[0];
3168     }
3169   return REG_NOERROR;
3170 }
3171
3172
3173 /* For all the back references in the current state, calculate the
3174    destination of the back references by the appropriate entry
3175    in MCTX->BKREF_ENTS.  */
3176
3177 static reg_errcode_t
3178 internal_function
3179 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3180                     Idx cur_str, Idx subexp_num, int type)
3181 {
3182   re_dfa_t *const dfa = mctx->dfa;
3183   reg_errcode_t err;
3184   Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3185   struct re_backref_cache_entry *ent;
3186
3187   if (cache_idx_start == REG_MISSING)
3188     return REG_NOERROR;
3189
3190  restart:
3191   ent = mctx->bkref_ents + cache_idx_start;
3192   do
3193     {
3194       Idx to_idx, next_node;
3195
3196       /* Is this entry ENT is appropriate?  */
3197       if (!re_node_set_contains (cur_nodes, ent->node))
3198         continue; /* No.  */
3199
3200       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3201       /* Calculate the destination of the back reference, and append it
3202          to MCTX->STATE_LOG.  */
3203       if (to_idx == cur_str)
3204         {
3205           /* The backreference did epsilon transit, we must re-check all the
3206              node in the current state.  */
3207           re_node_set new_dests;
3208           reg_errcode_t err2, err3;
3209           next_node = dfa->edests[ent->node].elems[0];
3210           if (re_node_set_contains (cur_nodes, next_node))
3211             continue;
3212           err = re_node_set_init_1 (&new_dests, next_node);
3213           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3214           err3 = re_node_set_merge (cur_nodes, &new_dests);
3215           re_node_set_free (&new_dests);
3216           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3217                   || err3 != REG_NOERROR, 0))
3218             {
3219               err = (err != REG_NOERROR ? err
3220                      : (err2 != REG_NOERROR ? err2 : err3));
3221               return err;
3222             }
3223           /* TODO: It is still inefficient...  */
3224           goto restart;
3225         }
3226       else
3227         {
3228           re_node_set union_set;
3229           next_node = dfa->nexts[ent->node];
3230           if (mctx->state_log[to_idx])
3231             {
3232               int ret;
3233               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3234                                         next_node))
3235                 continue;
3236               err = re_node_set_init_copy (&union_set,
3237                                            &mctx->state_log[to_idx]->nodes);
3238               ret = re_node_set_insert (&union_set, next_node);
3239               if (BE (err != REG_NOERROR || ret < 0, 0))
3240                 {
3241                   re_node_set_free (&union_set);
3242                   err = err != REG_NOERROR ? err : REG_ESPACE;
3243                   return err;
3244                 }
3245             }
3246           else
3247             {
3248               err = re_node_set_init_1 (&union_set, next_node);
3249               if (BE (err != REG_NOERROR, 0))
3250                 return err;
3251             }
3252           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3253           re_node_set_free (&union_set);
3254           if (BE (mctx->state_log[to_idx] == NULL
3255                   && err != REG_NOERROR, 0))
3256             return err;
3257         }
3258     }
3259   while (ent++->more);
3260   return REG_NOERROR;
3261 }
3262
3263 /* Build transition table for the state.
3264    Return 1 if succeeded, otherwise return NULL.  */
3265
3266 static int
3267 internal_function
3268 build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3269 {
3270   reg_errcode_t err;
3271   Idx i, j;
3272   int ch, need_word_trtable = 0;
3273   unsigned int elem, mask;
3274   int dests_node_malloced = 0, dest_states_malloced = 0;
3275   Idx ndests; /* Number of the destination states from `state'.  */
3276   re_dfastate_t **trtable;
3277   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3278   re_node_set follows, *dests_node;
3279   bitset *dests_ch;
3280   bitset acceptable;
3281
3282   /* We build DFA states which corresponds to the destination nodes
3283      from `state'.  `dests_node[i]' represents the nodes which i-th
3284      destination state contains, and `dests_ch[i]' represents the
3285      characters which i-th destination state accepts.  */
3286   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3287     dests_node = (re_node_set *)
3288       alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3289   else
3290     {
3291       dests_node = (re_node_set *)
3292         malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3293       if (BE (dests_node == NULL, 0))
3294         return 0;
3295       dests_node_malloced = 1;
3296     }
3297   dests_ch = (bitset *) (dests_node + SBC_MAX);
3298
3299   /* Initialize transiton table.  */
3300   state->word_trtable = state->trtable = NULL;
3301
3302   /* At first, group all nodes belonging to `state' into several
3303      destinations.  */
3304   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3305   if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3306     {
3307       if (dests_node_malloced)
3308         free (dests_node);
3309       /* Return 0 in case of an error, 1 otherwise.  */
3310       if (ndests == 0)
3311         {
3312           state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3313           return 1;
3314         }
3315       return 0;
3316     }
3317
3318   err = re_node_set_alloc (&follows, ndests + 1);
3319   if (BE (err != REG_NOERROR, 0))
3320     goto out_free;
3321
3322   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3323                          + ndests * 3 * sizeof (re_dfastate_t *)))
3324     dest_states = (re_dfastate_t **)
3325       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3326   else
3327     {
3328       dest_states = (re_dfastate_t **)
3329         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3330       if (BE (dest_states == NULL, 0))
3331         {
3332 out_free:
3333           if (dest_states_malloced)
3334             free (dest_states);
3335           re_node_set_free (&follows);
3336           for (i = 0; i < ndests; ++i)
3337             re_node_set_free (dests_node + i);
3338           if (dests_node_malloced)
3339             free (dests_node);
3340           return 0;
3341         }
3342       dest_states_malloced = 1;
3343     }
3344   dest_states_word = dest_states + ndests;
3345   dest_states_nl = dest_states_word + ndests;
3346   bitset_empty (acceptable);
3347
3348   /* Then build the states for all destinations.  */
3349   for (i = 0; i < ndests; ++i)
3350     {
3351       Idx next_node;
3352       re_node_set_empty (&follows);
3353       /* Merge the follows of this destination states.  */
3354       for (j = 0; j < dests_node[i].nelem; ++j)
3355         {
3356           next_node = dfa->nexts[dests_node[i].elems[j]];
3357           if (next_node != REG_MISSING)
3358             {
3359               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3360               if (BE (err != REG_NOERROR, 0))
3361                 goto out_free;
3362             }
3363         }
3364       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3365       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3366         goto out_free;
3367       /* If the new state has context constraint,
3368          build appropriate states for these contexts.  */
3369       if (dest_states[i]->has_constraint)
3370         {
3371           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3372                                                           CONTEXT_WORD);
3373           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3374             goto out_free;
3375
3376           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3377             need_word_trtable = 1;
3378
3379           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3380                                                         CONTEXT_NEWLINE);
3381           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3382             goto out_free;
3383         }
3384       else
3385         {
3386           dest_states_word[i] = dest_states[i];
3387           dest_states_nl[i] = dest_states[i];
3388         }
3389       bitset_merge (acceptable, dests_ch[i]);
3390     }
3391
3392   if (!BE (need_word_trtable, 0))
3393     {
3394       /* We don't care about whether the following character is a word
3395          character, or we are in a single-byte character set so we can
3396          discern by looking at the character code: allocate a
3397          256-entry transition table.  */
3398       trtable = state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3399       if (BE (trtable == NULL, 0))
3400         goto out_free;
3401
3402       /* For all characters ch...:  */
3403       for (i = 0; i < BITSET_UINTS; ++i)
3404         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3405              elem;
3406              mask <<= 1, elem >>= 1, ++ch)
3407           if (BE (elem & 1, 0))
3408             {
3409               /* There must be exactly one destination which accepts
3410                  character ch.  See group_nodes_into_DFAstates.  */
3411               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3412                 ;
3413
3414               /* j-th destination accepts the word character ch.  */
3415               if (dfa->word_char[i] & mask)
3416                 trtable[ch] = dest_states_word[j];
3417               else
3418                 trtable[ch] = dest_states[j];
3419             }
3420     }
3421   else
3422     {
3423       /* We care about whether the following character is a word
3424          character, and we are in a multi-byte character set: discern
3425          by looking at the character code: build two 256-entry
3426          transition tables, one starting at trtable[0] and one
3427          starting at trtable[SBC_MAX].  */
3428       trtable = state->word_trtable = re_calloc (re_dfastate_t *, 2 * SBC_MAX);
3429       if (BE (trtable == NULL, 0))
3430         goto out_free;
3431
3432       /* For all characters ch...:  */
3433       for (i = 0; i < BITSET_UINTS; ++i)
3434         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3435              elem;
3436              mask <<= 1, elem >>= 1, ++ch)
3437           if (BE (elem & 1, 0))
3438             {
3439               /* There must be exactly one destination which accepts
3440                  character ch.  See group_nodes_into_DFAstates.  */
3441               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3442                 ;
3443
3444               /* j-th destination accepts the word character ch.  */
3445               trtable[ch] = dest_states[j];
3446               trtable[ch + SBC_MAX] = dest_states_word[j];
3447             }
3448     }
3449
3450   /* new line */
3451   if (bitset_contain (acceptable, NEWLINE_CHAR))
3452     {
3453       /* The current state accepts newline character.  */
3454       for (j = 0; j < ndests; ++j)
3455         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3456           {
3457             /* k-th destination accepts newline character.  */
3458             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3459             if (need_word_trtable)
3460               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3461             /* There must be only one destination which accepts
3462                newline.  See group_nodes_into_DFAstates.  */
3463             break;
3464           }
3465     }
3466
3467   if (dest_states_malloced)
3468     free (dest_states);
3469
3470   re_node_set_free (&follows);
3471   for (i = 0; i < ndests; ++i)
3472     re_node_set_free (dests_node + i);
3473
3474   if (dests_node_malloced)
3475     free (dests_node);
3476
3477   return 1;
3478 }
3479
3480 /* Group all nodes belonging to STATE into several destinations.
3481    Then for all destinations, set the nodes belonging to the destination
3482    to DESTS_NODE[i] and set the characters accepted by the destination
3483    to DEST_CH[i].  This function return the number of destinations.  */
3484
3485 static Idx
3486 internal_function
3487 group_nodes_into_DFAstates (re_dfa_t *dfa, const re_dfastate_t *state,
3488                             re_node_set *dests_node, bitset *dests_ch)
3489 {
3490   reg_errcode_t err;
3491   int result;
3492   Idx i, j, k;
3493   Idx ndests; /* Number of the destinations from `state'.  */
3494   bitset accepts; /* Characters a node can accept.  */
3495   const re_node_set *cur_nodes = &state->nodes;
3496   bitset_empty (accepts);
3497   ndests = 0;
3498
3499   /* For all the nodes belonging to `state',  */
3500   for (i = 0; i < cur_nodes->nelem; ++i)
3501     {
3502       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3503       re_token_type_t type = node->type;
3504       unsigned int constraint = node->constraint;
3505
3506       /* Enumerate all single byte character this node can accept.  */
3507       if (type == CHARACTER)
3508         bitset_set (accepts, node->opr.c);
3509       else if (type == SIMPLE_BRACKET)
3510         {
3511           bitset_merge (accepts, node->opr.sbcset);
3512         }
3513       else if (type == OP_PERIOD)
3514         {
3515 #ifdef RE_ENABLE_I18N
3516           if (dfa->mb_cur_max > 1)
3517             bitset_merge (accepts, dfa->sb_char);
3518           else
3519 #endif
3520             bitset_set_all (accepts);
3521           if (!(dfa->syntax & REG_DOT_NEWLINE))
3522             bitset_clear (accepts, '\n');
3523           if (dfa->syntax & REG_DOT_NOT_NULL)
3524             bitset_clear (accepts, '\0');
3525         }
3526 #ifdef RE_ENABLE_I18N
3527       else if (type == OP_UTF8_PERIOD)
3528         {
3529           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3530           if (!(dfa->syntax & REG_DOT_NEWLINE))
3531             bitset_clear (accepts, '\n');
3532           if (dfa->syntax & REG_DOT_NOT_NULL)
3533             bitset_clear (accepts, '\0');
3534         }
3535 #endif
3536       else
3537         continue;
3538
3539       /* Check the `accepts' and sift the characters which are not
3540          match it the context.  */
3541       if (constraint)
3542         {
3543           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3544             {
3545               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3546               bitset_empty (accepts);
3547               if (accepts_newline)
3548                 bitset_set (accepts, NEWLINE_CHAR);
3549               else
3550                 continue;
3551             }
3552           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3553             {
3554               bitset_empty (accepts);
3555               continue;
3556             }
3557
3558           if (constraint & NEXT_WORD_CONSTRAINT)
3559             {
3560               unsigned int any_set = 0;
3561               if (type == CHARACTER && !node->word_char)
3562                 {
3563                   bitset_empty (accepts);
3564                   continue;
3565                 }
3566 #ifdef RE_ENABLE_I18N
3567               if (dfa->mb_cur_max > 1)
3568                 for (j = 0; j < BITSET_UINTS; ++j)
3569                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3570               else
3571 #endif
3572                 for (j = 0; j < BITSET_UINTS; ++j)
3573                   any_set |= (accepts[j] &= dfa->word_char[j]);
3574               if (!any_set)
3575                 continue;
3576             }
3577           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3578             {
3579               unsigned int any_set = 0;
3580               if (type == CHARACTER && node->word_char)
3581                 {
3582                   bitset_empty (accepts);
3583                   continue;
3584                 }
3585 #ifdef RE_ENABLE_I18N
3586               if (dfa->mb_cur_max > 1)
3587                 for (j = 0; j < BITSET_UINTS; ++j)
3588                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3589               else
3590 #endif
3591                 for (j = 0; j < BITSET_UINTS; ++j)
3592                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3593               if (!any_set)
3594                 continue;
3595             }
3596         }
3597
3598       /* Then divide `accepts' into DFA states, or create a new
3599          state.  Above, we make sure that accepts is not empty.  */
3600       for (j = 0; j < ndests; ++j)
3601         {
3602           bitset intersec; /* Intersection sets, see below.  */
3603           bitset remains;
3604           /* Flags, see below.  */
3605           int has_intersec, not_subset, not_consumed;
3606
3607           /* Optimization, skip if this state doesn't accept the character.  */
3608           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3609             continue;
3610
3611           /* Enumerate the intersection set of this state and `accepts'.  */
3612           has_intersec = 0;
3613           for (k = 0; k < BITSET_UINTS; ++k)
3614             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3615           /* And skip if the intersection set is empty.  */
3616           if (!has_intersec)
3617             continue;
3618
3619           /* Then check if this state is a subset of `accepts'.  */
3620           not_subset = not_consumed = 0;
3621           for (k = 0; k < BITSET_UINTS; ++k)
3622             {
3623               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3624               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3625             }
3626
3627           /* If this state isn't a subset of `accepts', create a
3628              new group state, which has the `remains'. */
3629           if (not_subset)
3630             {
3631               bitset_copy (dests_ch[ndests], remains);
3632               bitset_copy (dests_ch[j], intersec);
3633               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3634               if (BE (err != REG_NOERROR, 0))
3635                 goto error_return;
3636               ++ndests;
3637             }
3638
3639           /* Put the position in the current group. */
3640           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3641           if (BE (result < 0, 0))
3642             goto error_return;
3643
3644           /* If all characters are consumed, go to next node. */
3645           if (!not_consumed)
3646             break;
3647         }
3648       /* Some characters remain, create a new group. */
3649       if (j == ndests)
3650         {
3651           bitset_copy (dests_ch[ndests], accepts);
3652           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3653           if (BE (err != REG_NOERROR, 0))
3654             goto error_return;
3655           ++ndests;
3656           bitset_empty (accepts);
3657         }
3658     }
3659   return ndests;
3660  error_return:
3661   for (j = 0; j < ndests; ++j)
3662     re_node_set_free (dests_node + j);
3663   return REG_MISSING;
3664 }
3665
3666 #ifdef RE_ENABLE_I18N
3667 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3668    Return the number of the bytes the node accepts.
3669    STR_IDX is the current index of the input string.
3670
3671    This function handles the nodes which can accept one character, or
3672    one collating element like '.', '[a-z]', opposite to the other nodes
3673    can only accept one byte.  */
3674
3675 static int
3676 internal_function
3677 check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
3678                          const re_string_t *input, Idx str_idx)
3679 {
3680   const re_token_t *node = dfa->nodes + node_idx;
3681   int char_len, elem_len;
3682   Idx i;
3683
3684   if (BE (node->type == OP_UTF8_PERIOD, 0))
3685     {
3686       unsigned char c = re_string_byte_at (input, str_idx), d;
3687       if (BE (c < 0xc2, 1))
3688         return 0;
3689
3690       if (str_idx + 2 > input->len)
3691         return 0;
3692
3693       d = re_string_byte_at (input, str_idx + 1);
3694       if (c < 0xe0)
3695         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3696       else if (c < 0xf0)
3697         {
3698           char_len = 3;
3699           if (c == 0xe0 && d < 0xa0)
3700             return 0;
3701         }
3702       else if (c < 0xf8)
3703         {
3704           char_len = 4;
3705           if (c == 0xf0 && d < 0x90)
3706             return 0;
3707         }
3708       else if (c < 0xfc)
3709         {
3710           char_len = 5;
3711           if (c == 0xf8 && d < 0x88)
3712             return 0;
3713         }
3714       else if (c < 0xfe)
3715         {
3716           char_len = 6;
3717           if (c == 0xfc && d < 0x84)
3718             return 0;
3719         }
3720       else
3721         return 0;
3722
3723       if (str_idx + char_len > input->len)
3724         return 0;
3725
3726       for (i = 1; i < char_len; ++i)
3727         {
3728           d = re_string_byte_at (input, str_idx + i);
3729           if (d < 0x80 || d > 0xbf)
3730             return 0;
3731         }
3732       return char_len;
3733     }
3734
3735   char_len = re_string_char_size_at (input, str_idx);
3736   if (node->type == OP_PERIOD)
3737     {
3738       if (char_len <= 1)
3739         return 0;
3740       /* FIXME: I don't think this if is needed, as both '\n'
3741          and '\0' are char_len == 1.  */
3742       /* '.' accepts any one character except the following two cases.  */
3743       if ((!(dfa->syntax & REG_DOT_NEWLINE) &&
3744            re_string_byte_at (input, str_idx) == '\n') ||
3745           ((dfa->syntax & REG_DOT_NOT_NULL) &&
3746            re_string_byte_at (input, str_idx) == '\0'))
3747         return 0;
3748       return char_len;
3749     }
3750
3751   elem_len = re_string_elem_size_at (input, str_idx);
3752   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3753     return 0;
3754
3755   if (node->type == COMPLEX_BRACKET)
3756     {
3757       const re_charset_t *cset = node->opr.mbcset;
3758 # ifdef _LIBC
3759       const unsigned char *pin
3760         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3761       Idx j;
3762       uint32_t nrules;
3763 # endif /* _LIBC */
3764       int match_len = 0;
3765       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3766                     ? re_string_wchar_at (input, str_idx) : 0);
3767
3768       /* match with multibyte character?  */
3769       for (i = 0; i < cset->nmbchars; ++i)
3770         if (wc == cset->mbchars[i])
3771           {
3772             match_len = char_len;
3773             goto check_node_accept_bytes_match;
3774           }
3775       /* match with character_class?  */
3776       for (i = 0; i < cset->nchar_classes; ++i)
3777         {
3778           wctype_t wt = cset->char_classes[i];
3779           if (__iswctype (wc, wt))
3780             {
3781               match_len = char_len;
3782               goto check_node_accept_bytes_match;
3783             }
3784         }
3785
3786 # ifdef _LIBC
3787       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3788       if (nrules != 0)
3789         {
3790           unsigned int in_collseq = 0;
3791           const int32_t *table, *indirect;
3792           const unsigned char *weights, *extra;
3793           const char *collseqwc;
3794           int32_t idx;
3795           /* This #include defines a local function!  */
3796 #  include <locale/weight.h>
3797
3798           /* match with collating_symbol?  */
3799           if (cset->ncoll_syms)
3800             extra = (const unsigned char *)
3801               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3802           for (i = 0; i < cset->ncoll_syms; ++i)
3803             {
3804               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3805               /* Compare the length of input collating element and
3806                  the length of current collating element.  */
3807               if (*coll_sym != elem_len)
3808                 continue;
3809               /* Compare each bytes.  */
3810               for (j = 0; j < *coll_sym; j++)
3811                 if (pin[j] != coll_sym[1 + j])
3812                   break;
3813               if (j == *coll_sym)
3814                 {
3815                   /* Match if every bytes is equal.  */
3816                   match_len = j;
3817                   goto check_node_accept_bytes_match;
3818                 }
3819             }
3820
3821           if (cset->nranges)
3822             {
3823               if (elem_len <= char_len)
3824                 {
3825                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3826                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3827                 }
3828               else
3829                 in_collseq = find_collation_sequence_value (pin, elem_len);
3830             }
3831           /* match with range expression?  */
3832           for (i = 0; i < cset->nranges; ++i)
3833             if (cset->range_starts[i] <= in_collseq
3834                 && in_collseq <= cset->range_ends[i])
3835               {
3836                 match_len = elem_len;
3837                 goto check_node_accept_bytes_match;
3838               }
3839
3840           /* match with equivalence_class?  */
3841           if (cset->nequiv_classes)
3842             {
3843               const unsigned char *cp = pin;
3844               table = (const int32_t *)
3845                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3846               weights = (const unsigned char *)
3847                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3848               extra = (const unsigned char *)
3849                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3850               indirect = (const int32_t *)
3851                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3852               idx = findidx (&cp);
3853               if (idx > 0)
3854                 for (i = 0; i < cset->nequiv_classes; ++i)
3855                   {
3856                     int32_t equiv_class_idx = cset->equiv_classes[i];
3857                     size_t weight_len = weights[idx];
3858                     if (weight_len == weights[equiv_class_idx])
3859                       {
3860                         Idx cnt = 0;
3861                         while (cnt <= weight_len
3862                                && (weights[equiv_class_idx + 1 + cnt]
3863                                    == weights[idx + 1 + cnt]))
3864                           ++cnt;
3865                         if (cnt > weight_len)
3866                           {
3867                             match_len = elem_len;
3868                             goto check_node_accept_bytes_match;
3869                           }
3870                       }
3871                   }
3872             }
3873         }
3874       else
3875 # endif /* _LIBC */
3876         {
3877           /* match with range expression?  */
3878 #if __GNUC__ >= 2
3879           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3880 #else
3881           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3882           cmp_buf[2] = wc;
3883 #endif
3884           for (i = 0; i < cset->nranges; ++i)
3885             {
3886               cmp_buf[0] = cset->range_starts[i];
3887               cmp_buf[4] = cset->range_ends[i];
3888               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3889                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3890                 {
3891                   match_len = char_len;
3892                   goto check_node_accept_bytes_match;
3893                 }
3894             }
3895         }
3896     check_node_accept_bytes_match:
3897       if (!cset->non_match)
3898         return match_len;
3899       else
3900         {
3901           if (match_len > 0)
3902             return 0;
3903           else
3904             return (elem_len > char_len) ? elem_len : char_len;
3905         }
3906     }
3907   return 0;
3908 }
3909
3910 # ifdef _LIBC
3911 static unsigned int
3912 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3913 {
3914   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3915   if (nrules == 0)
3916     {
3917       if (mbs_len == 1)
3918         {
3919           /* No valid character.  Match it as a single byte character.  */
3920           const unsigned char *collseq = (const unsigned char *)
3921             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3922           return collseq[mbs[0]];
3923         }
3924       return UINT_MAX;
3925     }
3926   else
3927     {
3928       int32_t idx;
3929       const unsigned char *extra = (const unsigned char *)
3930         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3931       int32_t extrasize = (const unsigned char *)
3932         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3933
3934       for (idx = 0; idx < extrasize;)
3935         {
3936           int mbs_cnt, found = 0;
3937           int32_t elem_mbs_len;
3938           /* Skip the name of collating element name.  */
3939           idx = idx + extra[idx] + 1;
3940           elem_mbs_len = extra[idx++];
3941           if (mbs_len == elem_mbs_len)
3942             {
3943               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3944                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3945                   break;
3946               if (mbs_cnt == elem_mbs_len)
3947                 /* Found the entry.  */
3948                 found = 1;
3949             }
3950           /* Skip the byte sequence of the collating element.  */
3951           idx += elem_mbs_len;
3952           /* Adjust for the alignment.  */
3953           idx = (idx + 3) & ~3;
3954           /* Skip the collation sequence value.  */
3955           idx += sizeof (uint32_t);
3956           /* Skip the wide char sequence of the collating element.  */
3957           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3958           /* If we found the entry, return the sequence value.  */
3959           if (found)
3960             return *(uint32_t *) (extra + idx);
3961           /* Skip the collation sequence value.  */
3962           idx += sizeof (uint32_t);
3963         }
3964       return UINT_MAX;
3965     }
3966 }
3967 # endif /* _LIBC */
3968 #endif /* RE_ENABLE_I18N */
3969
3970 /* Check whether the node accepts the byte which is IDX-th
3971    byte of the INPUT.  */
3972
3973 static int
3974 internal_function
3975 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3976                    Idx idx)
3977 {
3978   unsigned char ch;
3979   ch = re_string_byte_at (&mctx->input, idx);
3980   switch (node->type)
3981     {
3982     case CHARACTER:
3983       if (node->opr.c != ch)
3984         return 0;
3985       break;
3986
3987     case SIMPLE_BRACKET:
3988       if (!bitset_contain (node->opr.sbcset, ch))
3989         return 0;
3990       break;
3991
3992 #ifdef RE_ENABLE_I18N
3993     case OP_UTF8_PERIOD:
3994       if (ch >= 0x80)
3995         return 0;
3996       /* FALLTHROUGH */
3997 #endif
3998     case OP_PERIOD:
3999       if ((ch == '\n' && !(mctx->dfa->syntax & REG_DOT_NEWLINE))
4000           || (ch == '\0' && (mctx->dfa->syntax & REG_DOT_NOT_NULL)))
4001         return 0;
4002       break;
4003
4004     default:
4005       return 0;
4006     }
4007
4008   if (node->constraint)
4009     {
4010       /* The node has constraints.  Check whether the current context
4011          satisfies the constraints.  */
4012       unsigned int context = re_string_context_at (&mctx->input, idx,
4013                                                    mctx->eflags);
4014       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4015         return 0;
4016     }
4017
4018   return 1;
4019 }
4020
4021 /* Extend the buffers, if the buffers have run out.  */
4022
4023 static reg_errcode_t
4024 internal_function
4025 extend_buffers (re_match_context_t *mctx)
4026 {
4027   reg_errcode_t ret;
4028   re_string_t *pstr = &mctx->input;
4029
4030   /* Double the lengthes of the buffers.  */
4031   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4032   if (BE (ret != REG_NOERROR, 0))
4033     return ret;
4034
4035   if (mctx->state_log != NULL)
4036     {
4037       /* And double the length of state_log.  */
4038       /* XXX We have no indication of the size of this buffer.  If this
4039          allocation fail we have no indication that the state_log array
4040          does not have the right size.  */
4041       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4042                                               pstr->bufs_len + 1);
4043       if (BE (new_array == NULL, 0))
4044         return REG_ESPACE;
4045       mctx->state_log = new_array;
4046     }
4047
4048   /* Then reconstruct the buffers.  */
4049   if (pstr->icase)
4050     {
4051 #ifdef RE_ENABLE_I18N
4052       if (pstr->mb_cur_max > 1)
4053         {
4054           ret = build_wcs_upper_buffer (pstr);
4055           if (BE (ret != REG_NOERROR, 0))
4056             return ret;
4057         }
4058       else
4059 #endif /* RE_ENABLE_I18N  */
4060         build_upper_buffer (pstr);
4061     }
4062   else
4063     {
4064 #ifdef RE_ENABLE_I18N
4065       if (pstr->mb_cur_max > 1)
4066         build_wcs_buffer (pstr);
4067       else
4068 #endif /* RE_ENABLE_I18N  */
4069         {
4070           if (pstr->trans != NULL)
4071             re_string_translate_buffer (pstr);
4072         }
4073     }
4074   return REG_NOERROR;
4075 }
4076
4077 \f
4078 /* Functions for matching context.  */
4079
4080 /* Initialize MCTX.  */
4081
4082 static reg_errcode_t
4083 internal_function
4084 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4085 {
4086   mctx->eflags = eflags;
4087   mctx->match_last = REG_MISSING;
4088   if (n > 0)
4089     {
4090       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4091       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4092       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4093         return REG_ESPACE;
4094     }
4095   /* Already zero-ed by the caller.
4096      else
4097        mctx->bkref_ents = NULL;
4098      mctx->nbkref_ents = 0;
4099      mctx->nsub_tops = 0;  */
4100   mctx->abkref_ents = n;
4101   mctx->max_mb_elem_len = 1;
4102   mctx->asub_tops = n;
4103   return REG_NOERROR;
4104 }
4105
4106 /* Clean the entries which depend on the current input in MCTX.
4107    This function must be invoked when the matcher changes the start index
4108    of the input, or changes the input string.  */
4109
4110 static void
4111 internal_function
4112 match_ctx_clean (re_match_context_t *mctx)
4113 {
4114   Idx st_idx;
4115   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4116     {
4117       Idx sl_idx;
4118       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4119       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4120         {
4121           re_sub_match_last_t *last = top->lasts[sl_idx];
4122           re_free (last->path.array);
4123           re_free (last);
4124         }
4125       re_free (top->lasts);
4126       if (top->path)
4127         {
4128           re_free (top->path->array);
4129           re_free (top->path);
4130         }
4131       free (top);
4132     }
4133
4134   mctx->nsub_tops = 0;
4135   mctx->nbkref_ents = 0;
4136 }
4137
4138 /* Free all the memory associated with MCTX.  */
4139
4140 static void
4141 internal_function
4142 match_ctx_free (re_match_context_t *mctx)
4143 {
4144   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4145   match_ctx_clean (mctx);
4146   re_free (mctx->sub_tops);
4147   re_free (mctx->bkref_ents);
4148 }
4149
4150 /* Add a new backreference entry to MCTX.
4151    Note that we assume that caller never call this function with duplicate
4152    entry, and call with STR_IDX which isn't smaller than any existing entry.
4153 */
4154
4155 static reg_errcode_t
4156 internal_function
4157 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx,
4158                      Idx from, Idx to)
4159 {
4160   if (mctx->nbkref_ents >= mctx->abkref_ents)
4161     {
4162       struct re_backref_cache_entry* new_entry;
4163       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4164                               mctx->abkref_ents * 2);
4165       if (BE (new_entry == NULL, 0))
4166         {
4167           re_free (mctx->bkref_ents);
4168           return REG_ESPACE;
4169         }
4170       mctx->bkref_ents = new_entry;
4171       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4172               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4173       mctx->abkref_ents *= 2;
4174     }
4175   if (mctx->nbkref_ents > 0
4176       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4177     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4178
4179   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4180   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4181   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4182   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4183
4184   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4185      If bit N is clear, means that this entry won't epsilon-transition to
4186      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4187      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4188      such node.
4189
4190      A backreference does not epsilon-transition unless it is empty, so set
4191      to all zeros if FROM != TO.  */
4192   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4193     = (from == to ? -1 : 0);
4194
4195   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4196   if (mctx->max_mb_elem_len < to - from)
4197     mctx->max_mb_elem_len = to - from;
4198   return REG_NOERROR;
4199 }
4200
4201 /* Return the first entry with the same str_idx, or REG_MISSING if none is
4202    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4203
4204 static Idx
4205 internal_function
4206 search_cur_bkref_entry (re_match_context_t *mctx, Idx str_idx)
4207 {
4208   Idx left, right, mid, last;
4209   last = right = mctx->nbkref_ents;
4210   for (left = 0; left < right;)
4211     {
4212       mid = (left + right) / 2;
4213       if (mctx->bkref_ents[mid].str_idx < str_idx)
4214         left = mid + 1;
4215       else
4216         right = mid;
4217     }
4218   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4219     return left;
4220   else
4221     return REG_MISSING;
4222 }
4223
4224 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4225    at STR_IDX.  */
4226
4227 static reg_errcode_t
4228 internal_function
4229 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4230 {
4231 #ifdef DEBUG
4232   assert (mctx->sub_tops != NULL);
4233   assert (mctx->asub_tops > 0);
4234 #endif
4235   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4236     {
4237       Idx new_asub_tops = mctx->asub_tops * 2;
4238       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4239                                                    re_sub_match_top_t *,
4240                                                    new_asub_tops);
4241       if (BE (new_array == NULL, 0))
4242         return REG_ESPACE;
4243       mctx->sub_tops = new_array;
4244       mctx->asub_tops = new_asub_tops;
4245     }
4246   mctx->sub_tops[mctx->nsub_tops] = re_calloc (re_sub_match_top_t, 1);
4247   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4248     return REG_ESPACE;
4249   mctx->sub_tops[mctx->nsub_tops]->node = node;
4250   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4251   return REG_NOERROR;
4252 }
4253
4254 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4255    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4256
4257 static re_sub_match_last_t *
4258 internal_function
4259 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4260 {
4261   re_sub_match_last_t *new_entry;
4262   if (BE (subtop->nlasts == subtop->alasts, 0))
4263     {
4264       Idx new_alasts = 2 * subtop->alasts + 1;
4265       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4266                                                     re_sub_match_last_t *,
4267                                                     new_alasts);
4268       if (BE (new_array == NULL, 0))
4269         return NULL;
4270       subtop->lasts = new_array;
4271       subtop->alasts = new_alasts;
4272     }
4273   new_entry = re_calloc (re_sub_match_last_t, 1);
4274   if (BE (new_entry != NULL, 1))
4275     {
4276       subtop->lasts[subtop->nlasts] = new_entry;
4277       new_entry->node = node;
4278       new_entry->str_idx = str_idx;
4279       ++subtop->nlasts;
4280     }
4281   return new_entry;
4282 }
4283
4284 static void
4285 internal_function
4286 sift_ctx_init (re_sift_context_t *sctx,
4287                re_dfastate_t **sifted_sts,
4288                re_dfastate_t **limited_sts,
4289                Idx last_node, Idx last_str_idx)
4290 {
4291   sctx->sifted_states = sifted_sts;
4292   sctx->limited_states = limited_sts;
4293   sctx->last_node = last_node;
4294   sctx->last_str_idx = last_str_idx;
4295   re_node_set_init_empty (&sctx->limits);
4296 }