lib/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 #ifndef __GNUC_PREREQ
  21 # if defined __GNUC__ && defined __GNUC_MINOR__
  22 #  define __GNUC_PREREQ(maj, min) \
  23         ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
  24 # else
  25 #  define __GNUC_PREREQ(maj, min) 0
  26 # endif
  27 #endif
  28
  29 #if !__GNUC_PREREQ (3, 0)
  30 # define pure
  31 #endif
  32
  33 static void re_string_construct_common (const char *str, Idx len,
  34                                         re_string_t *pstr,
  35                                         REG_TRANSLATE_TYPE trans, bool icase,
  36                                         const re_dfa_t *dfa) internal_function;
  37 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  38                                           const re_node_set *nodes,
  39                                           re_hashval_t hash) internal_function;
  40 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  41                                           const re_node_set *nodes,
  42                                           unsigned int context,
  43                                           re_hashval_t hash) internal_function;
  44 \f
  45 /* Functions for string operation.  */
  46
  47 /* This function allocate the buffers.  It is necessary to call
  48    re_string_reconstruct before using the object.  */
  49
  50 static reg_errcode_t
  51 internal_function
  52 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  53                     REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  54 {
  55   reg_errcode_t ret;
  56   Idx init_buf_len;
  57
  58   /* Ensure at least one character fits into the buffers.  */
  59   if (init_len < dfa->mb_cur_max)
  60     init_len = dfa->mb_cur_max;
  61   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  62   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  63
  64   ret = re_string_realloc_buffers (pstr, init_buf_len);
  65   if (BE (ret != REG_NOERROR, 0))
  66     return ret;
  67
  68   pstr->word_char = dfa->word_char;
  69   pstr->word_ops_used = dfa->word_ops_used;
  70   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  71   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  72   pstr->valid_raw_len = pstr->valid_len;
  73   return REG_NOERROR;
  74 }
  75
  76 /* This function allocate the buffers, and initialize them.  */
  77
  78 static reg_errcode_t
  79 internal_function
  80 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  81                      REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  82 {
  83   reg_errcode_t ret;
  84   memset (pstr, '\0', sizeof (re_string_t));
  85   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  86
  87   if (len > 0)
  88     {
  89       ret = re_string_realloc_buffers (pstr, len + 1);
  90       if (BE (ret != REG_NOERROR, 0))
  91         return ret;
  92     }
  93   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  94
  95   if (icase)
  96     {
  97 #ifdef RE_ENABLE_I18N
  98       if (dfa->mb_cur_max > 1)
  99         {
 100           while (1)
 101             {
 102               ret = build_wcs_upper_buffer (pstr);
 103               if (BE (ret != REG_NOERROR, 0))
 104                 return ret;
 105               if (pstr->valid_raw_len >= len)
 106                 break;
 107               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
 108                 break;
 109               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
 110               if (BE (ret != REG_NOERROR, 0))
 111                 return ret;
 112             }
 113         }
 114       else
 115 #endif /* RE_ENABLE_I18N  */
 116         build_upper_buffer (pstr);
 117     }
 118   else
 119     {
 120 #ifdef RE_ENABLE_I18N
 121       if (dfa->mb_cur_max > 1)
 122         build_wcs_buffer (pstr);
 123       else
 124 #endif /* RE_ENABLE_I18N  */
 125         {
 126           if (trans != NULL)
 127             re_string_translate_buffer (pstr);
 128           else
 129             {
 130               pstr->valid_len = pstr->bufs_len;
 131               pstr->valid_raw_len = pstr->bufs_len;
 132             }
 133         }
 134     }
 135
 136   return REG_NOERROR;
 137 }
 138
 139 /* Helper functions for re_string_allocate, and re_string_construct.  */
 140
 141 static reg_errcode_t
 142 internal_function
 143 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 144 {
 145 #ifdef RE_ENABLE_I18N
 146   if (pstr->mb_cur_max > 1)
 147     {
 148       wint_t *new_wcs = re_xrealloc (pstr->wcs, wint_t, new_buf_len);
 149       if (BE (new_wcs == NULL, 0))
 150         return REG_ESPACE;
 151       pstr->wcs = new_wcs;
 152       if (pstr->offsets != NULL)
 153         {
 154           Idx *new_offsets = re_xrealloc (pstr->offsets, Idx, new_buf_len);
 155           if (BE (new_offsets == NULL, 0))
 156             return REG_ESPACE;
 157           pstr->offsets = new_offsets;
 158         }
 159     }
 160 #endif /* RE_ENABLE_I18N  */
 161   if (pstr->mbs_allocated)
 162     {
 163       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 164                                            new_buf_len);
 165       if (BE (new_mbs == NULL, 0))
 166         return REG_ESPACE;
 167       pstr->mbs = new_mbs;
 168     }
 169   pstr->bufs_len = new_buf_len;
 170   return REG_NOERROR;
 171 }
 172
 173
 174 static void
 175 internal_function
 176 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 177                             REG_TRANSLATE_TYPE trans, bool icase,
 178                             const re_dfa_t *dfa)
 179 {
 180   pstr->raw_mbs = (const unsigned char *) str;
 181   pstr->len = len;
 182   pstr->raw_len = len;
 183   pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans;
 184   pstr->icase = icase;
 185   pstr->mbs_allocated = (trans != NULL || icase);
 186   pstr->mb_cur_max = dfa->mb_cur_max;
 187   pstr->is_utf8 = dfa->is_utf8;
 188   pstr->map_notascii = dfa->map_notascii;
 189   pstr->stop = pstr->len;
 190   pstr->raw_stop = pstr->stop;
 191 }
 192
 193 #ifdef RE_ENABLE_I18N
 194
 195 /* Build wide character buffer PSTR->WCS.
 196    If the byte sequence of the string are:
 197      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 198    Then wide character buffer will be:
 199      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 200    We use WEOF for padding, they indicate that the position isn't
 201    a first byte of a multibyte character.
 202
 203    Note that this function assumes PSTR->VALID_LEN elements are already
 204    built and starts from PSTR->VALID_LEN.  */
 205
 206 static void
 207 internal_function
 208 build_wcs_buffer (re_string_t *pstr)
 209 {
 210 #ifdef _LIBC
 211   unsigned char buf[MB_LEN_MAX];
 212   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 213 #else
 214   unsigned char buf[64];
 215 #endif
 216   mbstate_t prev_st;
 217   Idx byte_idx, end_idx, remain_len;
 218   size_t mbclen;
 219
 220   /* Build the buffers from pstr->valid_len to either pstr->len or
 221      pstr->bufs_len.  */
 222   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 223   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 224     {
 225       wchar_t wc;
 226       const char *p;
 227
 228       remain_len = end_idx - byte_idx;
 229       prev_st = pstr->cur_state;
 230       /* Apply the translation if we need.  */
 231       if (BE (pstr->trans != NULL, 0))
 232         {
 233           int i, ch;
 234
 235           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 236             {
 237               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 238               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 239             }
 240           p = (const char *) buf;
 241         }
 242       else
 243         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 244       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 245       if (BE (mbclen == (size_t) -2, 0))
 246         {
 247           /* The buffer doesn't have enough space, finish to build.  */
 248           pstr->cur_state = prev_st;
 249           break;
 250         }
 251       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 252         {
 253           /* We treat these cases as a singlebyte character.  */
 254           mbclen = 1;
 255           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 256           if (BE (pstr->trans != NULL, 0))
 257             wc = pstr->trans[wc];
 258           pstr->cur_state = prev_st;
 259         }
 260
 261       /* Write wide character and padding.  */
 262       pstr->wcs[byte_idx++] = wc;
 263       /* Write paddings.  */
 264       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 265         pstr->wcs[byte_idx++] = WEOF;
 266     }
 267   pstr->valid_len = byte_idx;
 268   pstr->valid_raw_len = byte_idx;
 269 }
 270
 271 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 272    but for REG_ICASE.  */
 273
 274 static reg_errcode_t
 275 internal_function
 276 build_wcs_upper_buffer (re_string_t *pstr)
 277 {
 278   mbstate_t prev_st;
 279   Idx src_idx, byte_idx, end_idx, remain_len;
 280   size_t mbclen;
 281 #ifdef _LIBC
 282   char buf[MB_LEN_MAX];
 283   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 284 #else
 285   char buf[64];
 286 #endif
 287
 288   byte_idx = pstr->valid_len;
 289   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 290
 291   /* The following optimization assumes that ASCII characters can be
 292      mapped to wide characters with a simple cast.  */
 293   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 294     {
 295       while (byte_idx < end_idx)
 296         {
 297           wchar_t wc;
 298
 299           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 300               && mbsinit (&pstr->cur_state))
 301             {
 302               /* In case of a singlebyte character.  */
 303               pstr->mbs[byte_idx]
 304                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 305               /* The next step uses the assumption that wchar_t is encoded
 306                  ASCII-safe: all ASCII values can be converted like this.  */
 307               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 308               ++byte_idx;
 309               continue;
 310             }
 311
 312           remain_len = end_idx - byte_idx;
 313           prev_st = pstr->cur_state;
 314           mbclen = mbrtowc (&wc,
 315                             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 316                              + byte_idx), remain_len, &pstr->cur_state);
 317           if (BE ((size_t) (mbclen + 2) > 2, 1))
 318             {
 319               wchar_t wcu = wc;
 320               if (iswlower (wc))
 321                 {
 322                   size_t mbcdlen;
 323
 324                   wcu = towupper (wc);
 325                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 326                   if (BE (mbclen == mbcdlen, 1))
 327                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 328                   else
 329                     {
 330                       src_idx = byte_idx;
 331                       goto offsets_needed;
 332                     }
 333                 }
 334               else
 335                 memcpy (pstr->mbs + byte_idx,
 336                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 337               pstr->wcs[byte_idx++] = wcu;
 338               /* Write paddings.  */
 339               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 340                 pstr->wcs[byte_idx++] = WEOF;
 341             }
 342           else if (mbclen == (size_t) -1 || mbclen == 0)
 343             {
 344               /* It is an invalid character or '\0'.  Just use the byte.  */
 345               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 346               pstr->mbs[byte_idx] = ch;
 347               /* And also cast it to wide char.  */
 348               pstr->wcs[byte_idx++] = (wchar_t) ch;
 349               if (BE (mbclen == (size_t) -1, 0))
 350                 pstr->cur_state = prev_st;
 351             }
 352           else
 353             {
 354               /* The buffer doesn't have enough space, finish to build.  */
 355               pstr->cur_state = prev_st;
 356               break;
 357             }
 358         }
 359       pstr->valid_len = byte_idx;
 360       pstr->valid_raw_len = byte_idx;
 361       return REG_NOERROR;
 362     }
 363   else
 364     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 365       {
 366         wchar_t wc;
 367         const char *p;
 368       offsets_needed:
 369         remain_len = end_idx - byte_idx;
 370         prev_st = pstr->cur_state;
 371         if (BE (pstr->trans != NULL, 0))
 372           {
 373             int i, ch;
 374
 375             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 376               {
 377                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 378                 buf[i] = pstr->trans[ch];
 379               }
 380             p = (const char *) buf;
 381           }
 382         else
 383           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 384         mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 385         if (BE ((size_t) (mbclen + 2) > 2, 1))
 386           {
 387             wchar_t wcu = wc;
 388             if (iswlower (wc))
 389               {
 390                 size_t mbcdlen;
 391
 392                 wcu = towupper (wc);
 393                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 394                 if (BE (mbclen == mbcdlen, 1))
 395                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 396                 else if (mbcdlen != (size_t) -1)
 397                   {
 398                     size_t i;
 399
 400                     if (byte_idx + mbcdlen > pstr->bufs_len)
 401                       {
 402                         pstr->cur_state = prev_st;
 403                         break;
 404                       }
 405
 406                     if (pstr->offsets == NULL)
 407                       {
 408                         pstr->offsets = re_xmalloc (Idx, pstr->bufs_len);
 409
 410                         if (pstr->offsets == NULL)
 411                           return REG_ESPACE;
 412                       }
 413                     if (!pstr->offsets_needed)
 414                       {
 415                         for (i = 0; i < (size_t) byte_idx; ++i)
 416                           pstr->offsets[i] = i;
 417                         pstr->offsets_needed = 1;
 418                       }
 419
 420                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 421                     pstr->wcs[byte_idx] = wcu;
 422                     pstr->offsets[byte_idx] = src_idx;
 423                     for (i = 1; i < mbcdlen; ++i)
 424                       {
 425                         pstr->offsets[byte_idx + i]
 426                           = src_idx + (i < mbclen ? i : mbclen - 1);
 427                         pstr->wcs[byte_idx + i] = WEOF;
 428                       }
 429                     pstr->len += mbcdlen - mbclen;
 430                     if (pstr->raw_stop > src_idx)
 431                       pstr->stop += mbcdlen - mbclen;
 432                     end_idx = (pstr->bufs_len > pstr->len)
 433                               ? pstr->len : pstr->bufs_len;
 434                     byte_idx += mbcdlen;
 435                     src_idx += mbclen;
 436                     continue;
 437                   }
 438                 else
 439                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 440               }
 441             else
 442               memcpy (pstr->mbs + byte_idx, p, mbclen);
 443
 444             if (BE (pstr->offsets_needed != 0, 0))
 445               {
 446                 size_t i;
 447                 for (i = 0; i < mbclen; ++i)
 448                   pstr->offsets[byte_idx + i] = src_idx + i;
 449               }
 450             src_idx += mbclen;
 451
 452             pstr->wcs[byte_idx++] = wcu;
 453             /* Write paddings.  */
 454             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 455               pstr->wcs[byte_idx++] = WEOF;
 456           }
 457         else if (mbclen == (size_t) -1 || mbclen == 0)
 458           {
 459             /* It is an invalid character or '\0'.  Just use the byte.  */
 460             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 461
 462             if (BE (pstr->trans != NULL, 0))
 463               ch = pstr->trans [ch];
 464             pstr->mbs[byte_idx] = ch;
 465
 466             if (BE (pstr->offsets_needed != 0, 0))
 467               pstr->offsets[byte_idx] = src_idx;
 468             ++src_idx;
 469
 470             /* And also cast it to wide char.  */
 471             pstr->wcs[byte_idx++] = (wchar_t) ch;
 472             if (BE (mbclen == (size_t) -1, 0))
 473               pstr->cur_state = prev_st;
 474           }
 475         else
 476           {
 477             /* The buffer doesn't have enough space, finish to build.  */
 478             pstr->cur_state = prev_st;
 479             break;
 480           }
 481       }
 482   pstr->valid_len = byte_idx;
 483   pstr->valid_raw_len = src_idx;
 484   return REG_NOERROR;
 485 }
 486
 487 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 488    Return the index.  */
 489
 490 static Idx
 491 internal_function
 492 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 493 {
 494   mbstate_t prev_st;
 495   Idx rawbuf_idx;
 496   size_t mbclen;
 497   wchar_t wc = 0;
 498
 499   /* Skip the characters which are not necessary to check.  */
 500   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 501        rawbuf_idx < new_raw_idx;)
 502     {
 503       Idx remain_len;
 504       remain_len = pstr->len - rawbuf_idx;
 505       prev_st = pstr->cur_state;
 506       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
 507                         remain_len, &pstr->cur_state);
 508       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 509         {
 510           /* We treat these cases as a singlebyte character.  */
 511           mbclen = 1;
 512           pstr->cur_state = prev_st;
 513         }
 514       /* Then proceed the next character.  */
 515       rawbuf_idx += mbclen;
 516     }
 517   *last_wc = (wint_t) wc;
 518   return rawbuf_idx;
 519 }
 520 #endif /* RE_ENABLE_I18N  */
 521
 522 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 523    This function is used in case of REG_ICASE.  */
 524
 525 static void
 526 internal_function
 527 build_upper_buffer (re_string_t *pstr)
 528 {
 529   Idx char_idx, end_idx;
 530   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 531
 532   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 533     {
 534       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 535       if (BE (pstr->trans != NULL, 0))
 536         ch = pstr->trans[ch];
 537       if (islower (ch))
 538         pstr->mbs[char_idx] = toupper (ch);
 539       else
 540         pstr->mbs[char_idx] = ch;
 541     }
 542   pstr->valid_len = char_idx;
 543   pstr->valid_raw_len = char_idx;
 544 }
 545
 546 /* Apply TRANS to the buffer in PSTR.  */
 547
 548 static void
 549 internal_function
 550 re_string_translate_buffer (re_string_t *pstr)
 551 {
 552   Idx buf_idx, end_idx;
 553   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 554
 555   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 556     {
 557       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 558       pstr->mbs[buf_idx] = pstr->trans[ch];
 559     }
 560
 561   pstr->valid_len = buf_idx;
 562   pstr->valid_raw_len = buf_idx;
 563 }
 564
 565 /* This function re-construct the buffers.
 566    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 567    convert to upper case in case of REG_ICASE, apply translation.  */
 568
 569 static reg_errcode_t
 570 internal_function
 571 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 572 {
 573   Idx offset;
 574
 575   if (BE (pstr->raw_mbs_idx <= idx, 0))
 576     offset = idx - pstr->raw_mbs_idx;
 577   else
 578     {
 579       /* Reset buffer.  */
 580 #ifdef RE_ENABLE_I18N
 581       if (pstr->mb_cur_max > 1)
 582         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 583 #endif /* RE_ENABLE_I18N */
 584       pstr->len = pstr->raw_len;
 585       pstr->stop = pstr->raw_stop;
 586       pstr->valid_len = 0;
 587       pstr->raw_mbs_idx = 0;
 588       pstr->valid_raw_len = 0;
 589       pstr->offsets_needed = 0;
 590       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 591                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 592       if (!pstr->mbs_allocated)
 593         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 594       offset = idx;
 595     }
 596
 597   if (BE (offset != 0, 1))
 598     {
 599       /* Are the characters which are already checked remain?  */
 600       if (BE (offset < pstr->valid_raw_len, 1)
 601 #ifdef RE_ENABLE_I18N
 602           /* Handling this would enlarge the code too much.
 603              Accept a slowdown in that case.  */
 604           && pstr->offsets_needed == 0
 605 #endif
 606          )
 607         {
 608           /* Yes, move them to the front of the buffer.  */
 609           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 610 #ifdef RE_ENABLE_I18N
 611           if (pstr->mb_cur_max > 1)
 612             memmove (pstr->wcs, pstr->wcs + offset,
 613                      (pstr->valid_len - offset) * sizeof (wint_t));
 614 #endif /* RE_ENABLE_I18N */
 615           if (BE (pstr->mbs_allocated, 0))
 616             memmove (pstr->mbs, pstr->mbs + offset,
 617                      pstr->valid_len - offset);
 618           pstr->valid_len -= offset;
 619           pstr->valid_raw_len -= offset;
 620 #if DEBUG
 621           assert (pstr->valid_len > 0);
 622 #endif
 623         }
 624       else
 625         {
 626           /* No, skip all characters until IDX.  */
 627 #ifdef RE_ENABLE_I18N
 628           if (BE (pstr->offsets_needed, 0))
 629             {
 630               pstr->len = pstr->raw_len - idx + offset;
 631               pstr->stop = pstr->raw_stop - idx + offset;
 632               pstr->offsets_needed = 0;
 633             }
 634 #endif
 635           pstr->valid_len = 0;
 636           pstr->valid_raw_len = 0;
 637 #ifdef RE_ENABLE_I18N
 638           if (pstr->mb_cur_max > 1)
 639             {
 640               Idx wcs_idx;
 641               wint_t wc = WEOF;
 642
 643               if (pstr->is_utf8)
 644                 {
 645                   const unsigned char *raw, *p, *q, *end;
 646
 647                   /* Special case UTF-8.  Multi-byte chars start with any
 648                      byte other than 0x80 - 0xbf.  */
 649                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 650                   end = raw + (offset - pstr->mb_cur_max);
 651                   for (p = raw + offset - 1; p >= end; --p)
 652                     if ((*p & 0xc0) != 0x80)
 653                       {
 654                         mbstate_t cur_state;
 655                         wchar_t wc2;
 656                         Idx mlen = raw + pstr->len - p;
 657                         unsigned char buf[6];
 658                         size_t mbclen;
 659
 660                         q = p;
 661                         if (BE (pstr->trans != NULL, 0))
 662                           {
 663                             int i = mlen < 6 ? mlen : 6;
 664                             while (--i >= 0)
 665                               buf[i] = pstr->trans[p[i]];
 666                             q = buf;
 667                           }
 668                         /* XXX Don't use mbrtowc, we know which conversion
 669                            to use (UTF-8 -> UCS4).  */
 670                         memset (&cur_state, 0, sizeof (cur_state));
 671                         mbclen = mbrtowc (&wc2, (const char *) p, mlen,
 672                                           &cur_state);
 673                         if (raw + offset - p <= mbclen && mbclen < (size_t) -2)
 674                           {
 675                             memset (&pstr->cur_state, '\0',
 676                                     sizeof (mbstate_t));
 677                             pstr->valid_len = mbclen - (raw + offset - p);
 678                             wc = wc2;
 679                           }
 680                         break;
 681                       }
 682                 }
 683
 684               if (wc == WEOF)
 685                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 686               if (BE (pstr->valid_len, 0))
 687                 {
 688                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 689                     pstr->wcs[wcs_idx] = WEOF;
 690                   if (pstr->mbs_allocated)
 691                     memset (pstr->mbs, -1, pstr->valid_len);
 692                 }
 693               pstr->valid_raw_len = pstr->valid_len;
 694               pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 695                                     && IS_WIDE_WORD_CHAR (wc))
 696                                    ? CONTEXT_WORD
 697                                    : ((IS_WIDE_NEWLINE (wc)
 698                                        && pstr->newline_anchor)
 699                                       ? CONTEXT_NEWLINE : 0));
 700             }
 701           else
 702 #endif /* RE_ENABLE_I18N */
 703             {
 704               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 705               if (pstr->trans)
 706                 c = pstr->trans[c];
 707               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 708                                    ? CONTEXT_WORD
 709                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 710                                       ? CONTEXT_NEWLINE : 0));
 711             }
 712         }
 713       if (!BE (pstr->mbs_allocated, 0))
 714         pstr->mbs += offset;
 715     }
 716   pstr->raw_mbs_idx = idx;
 717   pstr->len -= offset;
 718   pstr->stop -= offset;
 719
 720   /* Then build the buffers.  */
 721 #ifdef RE_ENABLE_I18N
 722   if (pstr->mb_cur_max > 1)
 723     {
 724       if (pstr->icase)
 725         {
 726           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 727           if (BE (ret != REG_NOERROR, 0))
 728             return ret;
 729         }
 730       else
 731         build_wcs_buffer (pstr);
 732     }
 733   else
 734 #endif /* RE_ENABLE_I18N */
 735   if (BE (pstr->mbs_allocated, 0))
 736     {
 737       if (pstr->icase)
 738         build_upper_buffer (pstr);
 739       else if (pstr->trans != NULL)
 740         re_string_translate_buffer (pstr);
 741     }
 742   else
 743     pstr->valid_len = pstr->len;
 744
 745   pstr->cur_idx = 0;
 746   return REG_NOERROR;
 747 }
 748
 749 static unsigned char
 750 internal_function __attribute ((pure))
 751 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 752 {
 753   int ch;
 754   Idx off;
 755
 756   /* Handle the common (easiest) cases first.  */
 757   if (BE (!pstr->mbs_allocated, 1))
 758     return re_string_peek_byte (pstr, idx);
 759
 760 #ifdef RE_ENABLE_I18N
 761   if (pstr->mb_cur_max > 1
 762       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 763     return re_string_peek_byte (pstr, idx);
 764 #endif
 765
 766   off = pstr->cur_idx + idx;
 767 #ifdef RE_ENABLE_I18N
 768   if (pstr->offsets_needed)
 769     off = pstr->offsets[off];
 770 #endif
 771
 772   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 773
 774 #ifdef RE_ENABLE_I18N
 775   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 776      this function returns CAPITAL LETTER I instead of first byte of
 777      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 778      since peek_byte_case doesn't advance cur_idx in any way.  */
 779   if (pstr->offsets_needed && !isascii (ch))
 780     return re_string_peek_byte (pstr, idx);
 781 #endif
 782
 783   return ch;
 784 }
 785
 786 static unsigned char
 787 internal_function __attribute ((pure))
 788 re_string_fetch_byte_case (re_string_t *pstr)
 789 {
 790   if (BE (!pstr->mbs_allocated, 1))
 791     return re_string_fetch_byte (pstr);
 792
 793 #ifdef RE_ENABLE_I18N
 794   if (pstr->offsets_needed)
 795     {
 796       Idx off;
 797       int ch;
 798
 799       /* For tr_TR.UTF-8 [[:islower:]] there is
 800          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 801          in that case the whole multi-byte character and return
 802          the original letter.  On the other side, with
 803          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 804          anything else would complicate things too much.  */
 805
 806       if (!re_string_first_byte (pstr, pstr->cur_idx))
 807         return re_string_fetch_byte (pstr);
 808
 809       off = pstr->offsets[pstr->cur_idx];
 810       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 811
 812       if (! isascii (ch))
 813         return re_string_fetch_byte (pstr);
 814
 815       re_string_skip_bytes (pstr,
 816                             re_string_char_size_at (pstr, pstr->cur_idx));
 817       return ch;
 818     }
 819 #endif
 820
 821   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 822 }
 823
 824 static void
 825 internal_function
 826 re_string_destruct (re_string_t *pstr)
 827 {
 828 #ifdef RE_ENABLE_I18N
 829   re_free (pstr->wcs);
 830   re_free (pstr->offsets);
 831 #endif /* RE_ENABLE_I18N  */
 832   if (pstr->mbs_allocated)
 833     re_free (pstr->mbs);
 834 }
 835
 836 /* Return the context at IDX in INPUT.  */
 837
 838 static unsigned int
 839 internal_function
 840 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 841 {
 842   int c;
 843   if (BE (! REG_VALID_INDEX (idx), 0))
 844     /* In this case, we use the value stored in input->tip_context,
 845        since we can't know the character in input->mbs[-1] here.  */
 846     return input->tip_context;
 847   if (BE (idx == input->len, 0))
 848     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 849             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 850 #ifdef RE_ENABLE_I18N
 851   if (input->mb_cur_max > 1)
 852     {
 853       wint_t wc;
 854       Idx wc_idx = idx;
 855       while(input->wcs[wc_idx] == WEOF)
 856         {
 857 #ifdef DEBUG
 858           /* It must not happen.  */
 859           assert (REG_VALID_INDEX (wc_idx));
 860 #endif
 861           --wc_idx;
 862           if (! REG_VALID_INDEX (wc_idx))
 863             return input->tip_context;
 864         }
 865       wc = input->wcs[wc_idx];
 866       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 867         return CONTEXT_WORD;
 868       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 869               ? CONTEXT_NEWLINE : 0);
 870     }
 871   else
 872 #endif
 873     {
 874       c = re_string_byte_at (input, idx);
 875       if (bitset_contain (input->word_char, c))
 876         return CONTEXT_WORD;
 877       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 878     }
 879 }
 880 \f
 881 /* Functions for set operation.  */
 882
 883 static reg_errcode_t
 884 internal_function
 885 re_node_set_alloc (re_node_set *set, Idx size)
 886 {
 887   set->alloc = size;
 888   set->nelem = 0;
 889   set->elems = re_xmalloc (Idx, size);
 890   if (BE (set->elems == NULL, 0))
 891     return REG_ESPACE;
 892   return REG_NOERROR;
 893 }
 894
 895 static reg_errcode_t
 896 internal_function
 897 re_node_set_init_1 (re_node_set *set, Idx elem)
 898 {
 899   set->alloc = 1;
 900   set->nelem = 1;
 901   set->elems = re_malloc (Idx, 1);
 902   if (BE (set->elems == NULL, 0))
 903     {
 904       set->alloc = set->nelem = 0;
 905       return REG_ESPACE;
 906     }
 907   set->elems[0] = elem;
 908   return REG_NOERROR;
 909 }
 910
 911 static reg_errcode_t
 912 internal_function
 913 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
 914 {
 915   set->alloc = 2;
 916   set->elems = re_malloc (Idx, 2);
 917   if (BE (set->elems == NULL, 0))
 918     return REG_ESPACE;
 919   if (elem1 == elem2)
 920     {
 921       set->nelem = 1;
 922       set->elems[0] = elem1;
 923     }
 924   else
 925     {
 926       set->nelem = 2;
 927       if (elem1 < elem2)
 928         {
 929           set->elems[0] = elem1;
 930           set->elems[1] = elem2;
 931         }
 932       else
 933         {
 934           set->elems[0] = elem2;
 935           set->elems[1] = elem1;
 936         }
 937     }
 938   return REG_NOERROR;
 939 }
 940
 941 static reg_errcode_t
 942 internal_function
 943 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
 944 {
 945   dest->nelem = src->nelem;
 946   if (src->nelem > 0)
 947     {
 948       dest->alloc = dest->nelem;
 949       dest->elems = re_malloc (Idx, dest->alloc);
 950       if (BE (dest->elems == NULL, 0))
 951         {
 952           dest->alloc = dest->nelem = 0;
 953           return REG_ESPACE;
 954         }
 955       memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
 956     }
 957   else
 958     re_node_set_init_empty (dest);
 959   return REG_NOERROR;
 960 }
 961
 962 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
 963    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
 964    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
 965
 966 static reg_errcode_t
 967 internal_function
 968 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
 969                            const re_node_set *src2)
 970 {
 971   Idx i1, i2, is, id, delta, sbase;
 972   if (src1->nelem == 0 || src2->nelem == 0)
 973     return REG_NOERROR;
 974
 975   /* We need dest->nelem + 2 * elems_in_intersection; this is a
 976      conservative estimate.  */
 977   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
 978     {
 979       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
 980       Idx *new_elems;
 981       if (sizeof (Idx) < 3
 982           && (new_alloc < dest->alloc
 983               || ((Idx) (src1->nelem + src2->nelem) < src1->nelem)))
 984         return REG_ESPACE;
 985       new_elems = re_xrealloc (dest->elems, Idx, new_alloc);
 986       if (BE (new_elems == NULL, 0))
 987         return REG_ESPACE;
 988       dest->elems = new_elems;
 989       dest->alloc = new_alloc;
 990     }
 991
 992   /* Find the items in the intersection of SRC1 and SRC2, and copy
 993      into the top of DEST those that are not already in DEST itself.  */
 994   sbase = dest->nelem + src1->nelem + src2->nelem;
 995   i1 = src1->nelem - 1;
 996   i2 = src2->nelem - 1;
 997   id = dest->nelem - 1;
 998   for (;;)
 999     {
1000       if (src1->elems[i1] == src2->elems[i2])
1001         {
1002           /* Try to find the item in DEST.  Maybe we could binary search?  */
1003           while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1004             --id;
1005
1006           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1007             dest->elems[--sbase] = src1->elems[i1];
1008
1009           if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1010             break;
1011         }
1012
1013       /* Lower the highest of the two items.  */
1014       else if (src1->elems[i1] < src2->elems[i2])
1015         {
1016           if (! REG_VALID_INDEX (--i2))
1017             break;
1018         }
1019       else
1020         {
1021           if (! REG_VALID_INDEX (--i1))
1022             break;
1023         }
1024     }
1025
1026   id = dest->nelem - 1;
1027   is = dest->nelem + src1->nelem + src2->nelem - 1;
1028   delta = is - sbase + 1;
1029
1030   /* Now copy.  When DELTA becomes zero, the remaining
1031      DEST elements are already in place; this is more or
1032      less the same loop that is in re_node_set_merge.  */
1033   dest->nelem += delta;
1034   if (delta > 0 && REG_VALID_INDEX (id))
1035     for (;;)
1036       {
1037         if (dest->elems[is] > dest->elems[id])
1038           {
1039             /* Copy from the top.  */
1040             dest->elems[id + delta--] = dest->elems[is--];
1041             if (delta == 0)
1042               break;
1043           }
1044         else
1045           {
1046             /* Slide from the bottom.  */
1047             dest->elems[id + delta] = dest->elems[id];
1048             if (! REG_VALID_INDEX (--id))
1049               break;
1050           }
1051       }
1052
1053   /* Copy remaining SRC elements.  */
1054   memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]);
1055
1056   return REG_NOERROR;
1057 }
1058
1059 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1060    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1061
1062 static reg_errcode_t
1063 internal_function
1064 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1065                         const re_node_set *src2)
1066 {
1067   Idx i1, i2, id;
1068   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1069     {
1070       dest->alloc = src1->nelem + src2->nelem;
1071       if (sizeof (Idx) < 2 && dest->alloc < src1->nelem)
1072         return REG_ESPACE;
1073       dest->elems = re_xmalloc (Idx, dest->alloc);
1074       if (BE (dest->elems == NULL, 0))
1075         return REG_ESPACE;
1076     }
1077   else
1078     {
1079       if (src1 != NULL && src1->nelem > 0)
1080         return re_node_set_init_copy (dest, src1);
1081       else if (src2 != NULL && src2->nelem > 0)
1082         return re_node_set_init_copy (dest, src2);
1083       else
1084         re_node_set_init_empty (dest);
1085       return REG_NOERROR;
1086     }
1087   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1088     {
1089       if (src1->elems[i1] > src2->elems[i2])
1090         {
1091           dest->elems[id++] = src2->elems[i2++];
1092           continue;
1093         }
1094       if (src1->elems[i1] == src2->elems[i2])
1095         ++i2;
1096       dest->elems[id++] = src1->elems[i1++];
1097     }
1098   if (i1 < src1->nelem)
1099     {
1100       memcpy (dest->elems + id, src1->elems + i1,
1101              (src1->nelem - i1) * sizeof dest->elems[0]);
1102       id += src1->nelem - i1;
1103     }
1104   else if (i2 < src2->nelem)
1105     {
1106       memcpy (dest->elems + id, src2->elems + i2,
1107              (src2->nelem - i2) * sizeof dest->elems[0]);
1108       id += src2->nelem - i2;
1109     }
1110   dest->nelem = id;
1111   return REG_NOERROR;
1112 }
1113
1114 /* Calculate the union set of the sets DEST and SRC. And store it to
1115    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1116
1117 static reg_errcode_t
1118 internal_function
1119 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1120 {
1121   Idx is, id, sbase, delta;
1122   if (src == NULL || src->nelem == 0)
1123     return REG_NOERROR;
1124   if (sizeof (Idx) < 3
1125       && ((Idx) (2 * src->nelem) < src->nelem
1126           || (Idx) (2 * src->nelem + dest->nelem) < dest->nelem))
1127     return REG_ESPACE;
1128   if (dest->alloc < 2 * src->nelem + dest->nelem)
1129     {
1130       Idx new_alloc = src->nelem + dest->alloc;
1131       Idx *new_buffer;
1132       if (sizeof (Idx) < 4 && new_alloc < dest->alloc)
1133         return REG_ESPACE;
1134       new_buffer = re_x2realloc (dest->elems, Idx, &new_alloc);
1135       if (BE (new_buffer == NULL, 0))
1136         return REG_ESPACE;
1137       dest->elems = new_buffer;
1138       dest->alloc = new_alloc;
1139     }
1140
1141   if (BE (dest->nelem == 0, 0))
1142     {
1143       dest->nelem = src->nelem;
1144       memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
1145       return REG_NOERROR;
1146     }
1147
1148   /* Copy into the top of DEST the items of SRC that are not
1149      found in DEST.  Maybe we could binary search in DEST?  */
1150   for (sbase = dest->nelem + 2 * src->nelem,
1151        is = src->nelem - 1, id = dest->nelem - 1;
1152        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1153     {
1154       if (dest->elems[id] == src->elems[is])
1155         is--, id--;
1156       else if (dest->elems[id] < src->elems[is])
1157         dest->elems[--sbase] = src->elems[is--];
1158       else /* if (dest->elems[id] > src->elems[is]) */
1159         --id;
1160     }
1161
1162   if (REG_VALID_INDEX (is))
1163     {
1164       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1165       sbase -= is + 1;
1166       memcpy (dest->elems + sbase, src->elems,
1167               (is + 1) * sizeof dest->elems[0]);
1168     }
1169
1170   id = dest->nelem - 1;
1171   is = dest->nelem + 2 * src->nelem - 1;
1172   delta = is - sbase + 1;
1173   if (delta == 0)
1174     return REG_NOERROR;
1175
1176   /* Now copy.  When DELTA becomes zero, the remaining
1177      DEST elements are already in place.  */
1178   dest->nelem += delta;
1179   for (;;)
1180     {
1181       if (dest->elems[is] > dest->elems[id])
1182         {
1183           /* Copy from the top.  */
1184           dest->elems[id + delta--] = dest->elems[is--];
1185           if (delta == 0)
1186             break;
1187         }
1188       else
1189         {
1190           /* Slide from the bottom.  */
1191           dest->elems[id + delta] = dest->elems[id];
1192           if (! REG_VALID_INDEX (--id))
1193             {
1194               /* Copy remaining SRC elements.  */
1195               memcpy (dest->elems, dest->elems + sbase,
1196                       delta * sizeof dest->elems[0]);
1197               break;
1198             }
1199         }
1200     }
1201
1202   return REG_NOERROR;
1203 }
1204
1205 /* Insert the new element ELEM to the re_node_set* SET.
1206    SET should not already have ELEM.
1207    Return true if successful.  */
1208
1209 static bool
1210 internal_function
1211 re_node_set_insert (re_node_set *set, Idx elem)
1212 {
1213   Idx idx;
1214   /* In case the set is empty.  */
1215   if (set->alloc == 0)
1216     return re_node_set_init_1 (set, elem) == REG_NOERROR;
1217
1218   if (BE (set->nelem, 0) == 0)
1219     {
1220       /* We already guaranteed above that set->alloc != 0.  */
1221       set->elems[0] = elem;
1222       ++set->nelem;
1223       return true;
1224     }
1225
1226   /* Realloc if we need.  */
1227   if (set->alloc == set->nelem)
1228     {
1229       Idx *new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1230       if (BE (new_elems == NULL, 0))
1231         return false;
1232       set->elems = new_elems;
1233     }
1234
1235   /* Move the elements which follows the new element.  Test the
1236      first element separately to skip a check in the inner loop.  */
1237   if (elem < set->elems[0])
1238     {
1239       idx = 0;
1240       for (idx = set->nelem; idx > 0; idx--)
1241         set->elems[idx] = set->elems[idx - 1];
1242     }
1243   else
1244     {
1245       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1246         set->elems[idx] = set->elems[idx - 1];
1247     }
1248
1249   /* Insert the new element.  */
1250   set->elems[idx] = elem;
1251   ++set->nelem;
1252   return true;
1253 }
1254
1255 /* Insert the new element ELEM to the re_node_set* SET.
1256    SET should not already have any element greater than or equal to ELEM.
1257    Return true if successful.  */
1258
1259 static bool
1260 internal_function
1261 re_node_set_insert_last (re_node_set *set, Idx elem)
1262 {
1263   /* Realloc if we need.  */
1264   if (set->alloc == set->nelem)
1265     {
1266       Idx *new_elems;
1267       new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1268       if (BE (new_elems == NULL, 0))
1269         return false;
1270       set->elems = new_elems;
1271     }
1272
1273   /* Insert the new element.  */
1274   set->elems[set->nelem++] = elem;
1275   return true;
1276 }
1277
1278 /* Compare two node sets SET1 and SET2.
1279    Return true if SET1 and SET2 are equivalent.  */
1280
1281 static bool
1282 internal_function __attribute ((pure))
1283 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1284 {
1285   Idx i;
1286   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1287     return false;
1288   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1289     if (set1->elems[i] != set2->elems[i])
1290       return false;
1291   return true;
1292 }
1293
1294 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1295
1296 static Idx
1297 internal_function __attribute ((pure))
1298 re_node_set_contains (const re_node_set *set, Idx elem)
1299 {
1300   __re_size_t idx, right, mid;
1301   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1302     return 0;
1303
1304   /* Binary search the element.  */
1305   idx = 0;
1306   right = set->nelem - 1;
1307   while (idx < right)
1308     {
1309       mid = (idx + right) / 2;
1310       if (set->elems[mid] < elem)
1311         idx = mid + 1;
1312       else
1313         right = mid;
1314     }
1315   return set->elems[idx] == elem ? idx + 1 : 0;
1316 }
1317
1318 static void
1319 internal_function
1320 re_node_set_remove_at (re_node_set *set, Idx idx)
1321 {
1322   if (idx < 0 || idx >= set->nelem)
1323     return;
1324   --set->nelem;
1325   for (; idx < set->nelem; idx++)
1326     set->elems[idx] = set->elems[idx + 1];
1327 }
1328 \f
1329
1330 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1331    Or return REG_MISSING if an error occurred.  */
1332
1333 static Idx
1334 internal_function
1335 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1336 {
1337   int type = token.type;
1338   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1339     {
1340       Idx new_nodes_alloc = dfa->nodes_alloc;
1341       Idx *new_nexts, *new_indices;
1342       re_node_set *new_edests, *new_eclosures;
1343
1344       re_token_t *new_nodes = re_x2realloc (dfa->nodes, re_token_t,
1345                                             &new_nodes_alloc);
1346       if (BE (new_nodes == NULL, 0))
1347         return REG_MISSING;
1348       dfa->nodes = new_nodes;
1349       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1350       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1351       new_edests = re_xrealloc (dfa->edests, re_node_set, new_nodes_alloc);
1352       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1353       if (BE (new_nexts == NULL || new_indices == NULL
1354               || new_edests == NULL || new_eclosures == NULL, 0))
1355         return REG_MISSING;
1356       dfa->nexts = new_nexts;
1357       dfa->org_indices = new_indices;
1358       dfa->edests = new_edests;
1359       dfa->eclosures = new_eclosures;
1360       dfa->nodes_alloc = new_nodes_alloc;
1361     }
1362   dfa->nodes[dfa->nodes_len] = token;
1363   dfa->nodes[dfa->nodes_len].constraint = 0;
1364 #ifdef RE_ENABLE_I18N
1365   dfa->nodes[dfa->nodes_len].accept_mb =
1366     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1367 #endif
1368   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1369   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1370   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1371   return dfa->nodes_len++;
1372 }
1373
1374 static inline re_hashval_t
1375 internal_function
1376 calc_state_hash (const re_node_set *nodes, unsigned int context)
1377 {
1378   re_hashval_t hash = nodes->nelem + context;
1379   Idx i;
1380   for (i = 0 ; i < nodes->nelem ; i++)
1381     hash += nodes->elems[i];
1382   return hash;
1383 }
1384
1385 /* Search for the state whose node_set is equivalent to NODES.
1386    Return the pointer to the state, if we found it in the DFA.
1387    Otherwise create the new one and return it.  In case of an error
1388    return NULL and set the error code in ERR.
1389    Note: - We assume NULL as the invalid state, then it is possible that
1390            return value is NULL and ERR is REG_NOERROR.
1391          - We never return non-NULL value in case of any errors, it is for
1392            optimization.  */
1393
1394 static re_dfastate_t*
1395 internal_function
1396 re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes)
1397 {
1398   re_hashval_t hash;
1399   re_dfastate_t *new_state;
1400   struct re_state_table_entry *spot;
1401   Idx i;
1402 #ifdef lint
1403   /* Suppress bogus uninitialized-variable warnings.  */
1404   *err = REG_NOERROR;
1405 #endif
1406   if (BE (nodes->nelem == 0, 0))
1407     {
1408       *err = REG_NOERROR;
1409       return NULL;
1410     }
1411   hash = calc_state_hash (nodes, 0);
1412   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1413
1414   for (i = 0 ; i < spot->num ; i++)
1415     {
1416       re_dfastate_t *state = spot->array[i];
1417       if (hash != state->hash)
1418         continue;
1419       if (re_node_set_compare (&state->nodes, nodes))
1420         return state;
1421     }
1422
1423   /* There are no appropriate state in the dfa, create the new one.  */
1424   new_state = create_ci_newstate (dfa, nodes, hash);
1425   if (BE (new_state != NULL, 1))
1426     return new_state;
1427   else
1428     {
1429       *err = REG_ESPACE;
1430       return NULL;
1431     }
1432 }
1433
1434 /* Search for the state whose node_set is equivalent to NODES and
1435    whose context is equivalent to CONTEXT.
1436    Return the pointer to the state, if we found it in the DFA.
1437    Otherwise create the new one and return it.  In case of an error
1438    return NULL and set the error code in ERR.
1439    Note: - We assume NULL as the invalid state, then it is possible that
1440            return value is NULL and ERR is REG_NOERROR.
1441          - We never return non-NULL value in case of any errors, it is for
1442            optimization.  */
1443
1444 static re_dfastate_t*
1445 internal_function
1446 re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa,
1447                           const re_node_set *nodes, unsigned int context)
1448 {
1449   re_hashval_t hash;
1450   re_dfastate_t *new_state;
1451   struct re_state_table_entry *spot;
1452   Idx i;
1453 #ifdef lint
1454   /* Suppress bogus uninitialized-variable warnings.  */
1455   *err = REG_NOERROR;
1456 #endif
1457   if (nodes->nelem == 0)
1458     {
1459       *err = REG_NOERROR;
1460       return NULL;
1461     }
1462   hash = calc_state_hash (nodes, context);
1463   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1464
1465   for (i = 0 ; i < spot->num ; i++)
1466     {
1467       re_dfastate_t *state = spot->array[i];
1468       if (state->hash == hash
1469           && state->context == context
1470           && re_node_set_compare (state->entrance_nodes, nodes))
1471         return state;
1472     }
1473   /* There are no appropriate state in `dfa', create the new one.  */
1474   new_state = create_cd_newstate (dfa, nodes, context, hash);
1475   if (BE (new_state != NULL, 1))
1476     return new_state;
1477   else
1478     {
1479       *err = REG_ESPACE;
1480       return NULL;
1481     }
1482 }
1483
1484 /* Finish initialization of the new state NEWSTATE, and using its hash value
1485    HASH put in the appropriate bucket of DFA's state table.  Return value
1486    indicates the error code if failed.  */
1487
1488 static reg_errcode_t
1489 internal_function
1490 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash)
1491 {
1492   struct re_state_table_entry *spot;
1493   reg_errcode_t err;
1494   Idx i;
1495
1496   newstate->hash = hash;
1497   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1498   if (BE (err != REG_NOERROR, 0))
1499     return REG_ESPACE;
1500   for (i = 0; i < newstate->nodes.nelem; i++)
1501     {
1502       Idx elem = newstate->nodes.elems[i];
1503       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1504         {
1505           bool ok = re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1506           if (BE (! ok, 0))
1507             return REG_ESPACE;
1508         }
1509     }
1510
1511   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1512   if (BE (spot->alloc <= spot->num, 0))
1513     {
1514       Idx new_alloc = spot->num;
1515       re_dfastate_t **new_array = re_x2realloc (spot->array, re_dfastate_t *,
1516                                                 &new_alloc);
1517       if (BE (new_array == NULL, 0))
1518         return REG_ESPACE;
1519       spot->array = new_array;
1520       spot->alloc = new_alloc;
1521     }
1522   spot->array[spot->num++] = newstate;
1523   return REG_NOERROR;
1524 }
1525
1526 /* Create the new state which is independ of contexts.
1527    Return the new state if succeeded, otherwise return NULL.  */
1528
1529 static re_dfastate_t *
1530 internal_function
1531 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1532                     re_hashval_t hash)
1533 {
1534   Idx i;
1535   reg_errcode_t err;
1536   re_dfastate_t *newstate;
1537
1538   newstate = re_calloc (re_dfastate_t, 1);
1539   if (BE (newstate == NULL, 0))
1540     return NULL;
1541   err = re_node_set_init_copy (&newstate->nodes, nodes);
1542   if (BE (err != REG_NOERROR, 0))
1543     {
1544       re_free (newstate);
1545       return NULL;
1546     }
1547
1548   newstate->entrance_nodes = &newstate->nodes;
1549   for (i = 0 ; i < nodes->nelem ; i++)
1550     {
1551       re_token_t *node = dfa->nodes + nodes->elems[i];
1552       re_token_type_t type = node->type;
1553       if (type == CHARACTER && !node->constraint)
1554         continue;
1555 #ifdef RE_ENABLE_I18N
1556       newstate->accept_mb |= node->accept_mb;
1557 #endif /* RE_ENABLE_I18N */
1558
1559       /* If the state has the halt node, the state is a halt state.  */
1560       if (type == END_OF_RE)
1561         newstate->halt = 1;
1562       else if (type == OP_BACK_REF)
1563         newstate->has_backref = 1;
1564       else if (type == ANCHOR || node->constraint)
1565         newstate->has_constraint = 1;
1566     }
1567   err = register_state (dfa, newstate, hash);
1568   if (BE (err != REG_NOERROR, 0))
1569     {
1570       free_state (newstate);
1571       newstate = NULL;
1572     }
1573   return newstate;
1574 }
1575
1576 /* Create the new state which is depend on the context CONTEXT.
1577    Return the new state if succeeded, otherwise return NULL.  */
1578
1579 static re_dfastate_t *
1580 internal_function
1581 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1582                     unsigned int context, re_hashval_t hash)
1583 {
1584   Idx i, nctx_nodes = 0;
1585   reg_errcode_t err;
1586   re_dfastate_t *newstate;
1587
1588   newstate = re_calloc (re_dfastate_t, 1);
1589   if (BE (newstate == NULL, 0))
1590     return NULL;
1591   err = re_node_set_init_copy (&newstate->nodes, nodes);
1592   if (BE (err != REG_NOERROR, 0))
1593     {
1594       re_free (newstate);
1595       return NULL;
1596     }
1597
1598   newstate->context = context;
1599   newstate->entrance_nodes = &newstate->nodes;
1600
1601   for (i = 0 ; i < nodes->nelem ; i++)
1602     {
1603       unsigned int constraint = 0;
1604       re_token_t *node = dfa->nodes + nodes->elems[i];
1605       re_token_type_t type = node->type;
1606       if (node->constraint)
1607         constraint = node->constraint;
1608
1609       if (type == CHARACTER && !constraint)
1610         continue;
1611 #ifdef RE_ENABLE_I18N
1612       newstate->accept_mb |= node->accept_mb;
1613 #endif /* RE_ENABLE_I18N */
1614
1615       /* If the state has the halt node, the state is a halt state.  */
1616       if (type == END_OF_RE)
1617         newstate->halt = 1;
1618       else if (type == OP_BACK_REF)
1619         newstate->has_backref = 1;
1620       else if (type == ANCHOR)
1621         constraint = node->opr.ctx_type;
1622
1623       if (constraint)
1624         {
1625           if (newstate->entrance_nodes == &newstate->nodes)
1626             {
1627               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1628               if (BE (newstate->entrance_nodes == NULL, 0))
1629                 {
1630                   free_state (newstate);
1631                   return NULL;
1632                 }
1633               re_node_set_init_copy (newstate->entrance_nodes, nodes);
1634               nctx_nodes = 0;
1635               newstate->has_constraint = 1;
1636             }
1637
1638           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1639             {
1640               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1641               ++nctx_nodes;
1642             }
1643         }
1644     }
1645   err = register_state (dfa, newstate, hash);
1646   if (BE (err != REG_NOERROR, 0))
1647     {
1648       free_state (newstate);
1649       newstate = NULL;
1650     }
1651   return  newstate;
1652 }
1653
1654 static void
1655 internal_function
1656 free_state (re_dfastate_t *state)
1657 {
1658   re_node_set_free (&state->non_eps_nodes);
1659   re_node_set_free (&state->inveclosure);
1660   if (state->entrance_nodes != &state->nodes)
1661     {
1662       re_node_set_free (state->entrance_nodes);
1663       re_free (state->entrance_nodes);
1664     }
1665   re_node_set_free (&state->nodes);
1666   re_free (state->word_trtable);
1667   re_free (state->trtable);
1668   re_free (state);
1669 }