lib/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static void re_string_construct_common (const char *str, Idx len,
  21                                         re_string_t *pstr,
  22                                         RE_TRANSLATE_TYPE trans, bool icase,
  23                                         const re_dfa_t *dfa) internal_function;
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           re_hashval_t hash) internal_function;
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           re_hashval_t hash) internal_function;
  31 \f
  32 /* Functions for string operation.  */
  33
  34 /* This function allocate the buffers.  It is necessary to call
  35    re_string_reconstruct before using the object.  */
  36
  37 static reg_errcode_t
  38 internal_function
  39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  40                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  41 {
  42   reg_errcode_t ret;
  43   Idx init_buf_len;
  44
  45   /* Ensure at least one character fits into the buffers.  */
  46   if (init_len < dfa->mb_cur_max)
  47     init_len = dfa->mb_cur_max;
  48   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  49   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  50
  51   ret = re_string_realloc_buffers (pstr, init_buf_len);
  52   if (BE (ret != REG_NOERROR, 0))
  53     return ret;
  54
  55   pstr->word_char = dfa->word_char;
  56   pstr->word_ops_used = dfa->word_ops_used;
  57   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  58   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  59   pstr->valid_raw_len = pstr->valid_len;
  60   return REG_NOERROR;
  61 }
  62
  63 /* This function allocate the buffers, and initialize them.  */
  64
  65 static reg_errcode_t
  66 internal_function
  67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  68                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  69 {
  70   reg_errcode_t ret;
  71   memset (pstr, '\0', sizeof (re_string_t));
  72   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  73
  74   if (len > 0)
  75     {
  76       ret = re_string_realloc_buffers (pstr, len + 1);
  77       if (BE (ret != REG_NOERROR, 0))
  78         return ret;
  79     }
  80   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  81
  82   if (icase)
  83     {
  84 #ifdef RE_ENABLE_I18N
  85       if (dfa->mb_cur_max > 1)
  86         {
  87           while (1)
  88             {
  89               ret = build_wcs_upper_buffer (pstr);
  90               if (BE (ret != REG_NOERROR, 0))
  91                 return ret;
  92               if (pstr->valid_raw_len >= len)
  93                 break;
  94               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  95                 break;
  96               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  97               if (BE (ret != REG_NOERROR, 0))
  98                 return ret;
  99             }
 100         }
 101       else
 102 #endif /* RE_ENABLE_I18N  */
 103         build_upper_buffer (pstr);
 104     }
 105   else
 106     {
 107 #ifdef RE_ENABLE_I18N
 108       if (dfa->mb_cur_max > 1)
 109         build_wcs_buffer (pstr);
 110       else
 111 #endif /* RE_ENABLE_I18N  */
 112         {
 113           if (trans != NULL)
 114             re_string_translate_buffer (pstr);
 115           else
 116             {
 117               pstr->valid_len = pstr->bufs_len;
 118               pstr->valid_raw_len = pstr->bufs_len;
 119             }
 120         }
 121     }
 122
 123   return REG_NOERROR;
 124 }
 125
 126 /* Helper functions for re_string_allocate, and re_string_construct.  */
 127
 128 static reg_errcode_t
 129 internal_function
 130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 131 {
 132 #ifdef RE_ENABLE_I18N
 133   if (pstr->mb_cur_max > 1)
 134     {
 135       wint_t *new_wcs;
 136
 137       /* Avoid overflow.  */
 138       size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
 139       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
 140         return REG_ESPACE;
 141
 142       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 143       if (BE (new_wcs == NULL, 0))
 144         return REG_ESPACE;
 145       pstr->wcs = new_wcs;
 146       if (pstr->offsets != NULL)
 147         {
 148           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 149           if (BE (new_offsets == NULL, 0))
 150             return REG_ESPACE;
 151           pstr->offsets = new_offsets;
 152         }
 153     }
 154 #endif /* RE_ENABLE_I18N  */
 155   if (pstr->mbs_allocated)
 156     {
 157       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 158                                            new_buf_len);
 159       if (BE (new_mbs == NULL, 0))
 160         return REG_ESPACE;
 161       pstr->mbs = new_mbs;
 162     }
 163   pstr->bufs_len = new_buf_len;
 164   return REG_NOERROR;
 165 }
 166
 167
 168 static void
 169 internal_function
 170 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 171                             RE_TRANSLATE_TYPE trans, bool icase,
 172                             const re_dfa_t *dfa)
 173 {
 174   pstr->raw_mbs = (const unsigned char *) str;
 175   pstr->len = len;
 176   pstr->raw_len = len;
 177   pstr->trans = trans;
 178   pstr->icase = icase;
 179   pstr->mbs_allocated = (trans != NULL || icase);
 180   pstr->mb_cur_max = dfa->mb_cur_max;
 181   pstr->is_utf8 = dfa->is_utf8;
 182   pstr->map_notascii = dfa->map_notascii;
 183   pstr->stop = pstr->len;
 184   pstr->raw_stop = pstr->stop;
 185 }
 186
 187 #ifdef RE_ENABLE_I18N
 188
 189 /* Build wide character buffer PSTR->WCS.
 190    If the byte sequence of the string are:
 191      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 192    Then wide character buffer will be:
 193      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 194    We use WEOF for padding, they indicate that the position isn't
 195    a first byte of a multibyte character.
 196
 197    Note that this function assumes PSTR->VALID_LEN elements are already
 198    built and starts from PSTR->VALID_LEN.  */
 199
 200 static void
 201 internal_function
 202 build_wcs_buffer (re_string_t *pstr)
 203 {
 204 #ifdef _LIBC
 205   unsigned char buf[MB_LEN_MAX];
 206   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 207 #else
 208   unsigned char buf[64];
 209 #endif
 210   mbstate_t prev_st;
 211   Idx byte_idx, end_idx, remain_len;
 212   size_t mbclen;
 213
 214   /* Build the buffers from pstr->valid_len to either pstr->len or
 215      pstr->bufs_len.  */
 216   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 217   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 218     {
 219       wchar_t wc;
 220       const char *p;
 221
 222       remain_len = end_idx - byte_idx;
 223       prev_st = pstr->cur_state;
 224       /* Apply the translation if we need.  */
 225       if (BE (pstr->trans != NULL, 0))
 226         {
 227           int i, ch;
 228
 229           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 230             {
 231               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 232               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 233             }
 234           p = (const char *) buf;
 235         }
 236       else
 237         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 238       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 239       if (BE (mbclen == (size_t) -2, 0))
 240         {
 241           /* The buffer doesn't have enough space, finish to build.  */
 242           pstr->cur_state = prev_st;
 243           break;
 244         }
 245       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 246         {
 247           /* We treat these cases as a singlebyte character.  */
 248           mbclen = 1;
 249           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 250           if (BE (pstr->trans != NULL, 0))
 251             wc = pstr->trans[wc];
 252           pstr->cur_state = prev_st;
 253         }
 254
 255       /* Write wide character and padding.  */
 256       pstr->wcs[byte_idx++] = wc;
 257       /* Write paddings.  */
 258       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 259         pstr->wcs[byte_idx++] = WEOF;
 260     }
 261   pstr->valid_len = byte_idx;
 262   pstr->valid_raw_len = byte_idx;
 263 }
 264
 265 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 266    but for REG_ICASE.  */
 267
 268 static reg_errcode_t
 269 internal_function
 270 build_wcs_upper_buffer (re_string_t *pstr)
 271 {
 272   mbstate_t prev_st;
 273   Idx src_idx, byte_idx, end_idx, remain_len;
 274   size_t mbclen;
 275 #ifdef _LIBC
 276   char buf[MB_LEN_MAX];
 277   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 278 #else
 279   char buf[64];
 280 #endif
 281
 282   byte_idx = pstr->valid_len;
 283   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 284
 285   /* The following optimization assumes that ASCII characters can be
 286      mapped to wide characters with a simple cast.  */
 287   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 288     {
 289       while (byte_idx < end_idx)
 290         {
 291           wchar_t wc;
 292
 293           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 294               && mbsinit (&pstr->cur_state))
 295             {
 296               /* In case of a singlebyte character.  */
 297               pstr->mbs[byte_idx]
 298                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 299               /* The next step uses the assumption that wchar_t is encoded
 300                  ASCII-safe: all ASCII values can be converted like this.  */
 301               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 302               ++byte_idx;
 303               continue;
 304             }
 305
 306           remain_len = end_idx - byte_idx;
 307           prev_st = pstr->cur_state;
 308           mbclen = mbrtowc (&wc,
 309                             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 310                              + byte_idx), remain_len, &pstr->cur_state);
 311           if (BE (mbclen < (size_t) -2, 1))
 312             {
 313               wchar_t wcu = wc;
 314               if (iswlower (wc))
 315                 {
 316                   size_t mbcdlen;
 317
 318                   wcu = towupper (wc);
 319                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 320                   if (BE (mbclen == mbcdlen, 1))
 321                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 322                   else
 323                     {
 324                       src_idx = byte_idx;
 325                       goto offsets_needed;
 326                     }
 327                 }
 328               else
 329                 memcpy (pstr->mbs + byte_idx,
 330                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 331               pstr->wcs[byte_idx++] = wcu;
 332               /* Write paddings.  */
 333               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 334                 pstr->wcs[byte_idx++] = WEOF;
 335             }
 336           else if (mbclen == (size_t) -1 || mbclen == 0)
 337             {
 338               /* It is an invalid character or '\0'.  Just use the byte.  */
 339               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 340               pstr->mbs[byte_idx] = ch;
 341               /* And also cast it to wide char.  */
 342               pstr->wcs[byte_idx++] = (wchar_t) ch;
 343               if (BE (mbclen == (size_t) -1, 0))
 344                 pstr->cur_state = prev_st;
 345             }
 346           else
 347             {
 348               /* The buffer doesn't have enough space, finish to build.  */
 349               pstr->cur_state = prev_st;
 350               break;
 351             }
 352         }
 353       pstr->valid_len = byte_idx;
 354       pstr->valid_raw_len = byte_idx;
 355       return REG_NOERROR;
 356     }
 357   else
 358     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 359       {
 360         wchar_t wc;
 361         const char *p;
 362       offsets_needed:
 363         remain_len = end_idx - byte_idx;
 364         prev_st = pstr->cur_state;
 365         if (BE (pstr->trans != NULL, 0))
 366           {
 367             int i, ch;
 368
 369             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 370               {
 371                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 372                 buf[i] = pstr->trans[ch];
 373               }
 374             p = (const char *) buf;
 375           }
 376         else
 377           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 378         mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 379         if (BE (mbclen < (size_t) -2, 1))
 380           {
 381             wchar_t wcu = wc;
 382             if (iswlower (wc))
 383               {
 384                 size_t mbcdlen;
 385
 386                 wcu = towupper (wc);
 387                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 388                 if (BE (mbclen == mbcdlen, 1))
 389                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 390                 else if (mbcdlen != (size_t) -1)
 391                   {
 392                     size_t i;
 393
 394                     if (byte_idx + mbcdlen > pstr->bufs_len)
 395                       {
 396                         pstr->cur_state = prev_st;
 397                         break;
 398                       }
 399
 400                     if (pstr->offsets == NULL)
 401                       {
 402                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 403
 404                         if (pstr->offsets == NULL)
 405                           return REG_ESPACE;
 406                       }
 407                     if (!pstr->offsets_needed)
 408                       {
 409                         for (i = 0; i < (size_t) byte_idx; ++i)
 410                           pstr->offsets[i] = i;
 411                         pstr->offsets_needed = 1;
 412                       }
 413
 414                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 415                     pstr->wcs[byte_idx] = wcu;
 416                     pstr->offsets[byte_idx] = src_idx;
 417                     for (i = 1; i < mbcdlen; ++i)
 418                       {
 419                         pstr->offsets[byte_idx + i]
 420                           = src_idx + (i < mbclen ? i : mbclen - 1);
 421                         pstr->wcs[byte_idx + i] = WEOF;
 422                       }
 423                     pstr->len += mbcdlen - mbclen;
 424                     if (pstr->raw_stop > src_idx)
 425                       pstr->stop += mbcdlen - mbclen;
 426                     end_idx = (pstr->bufs_len > pstr->len)
 427                               ? pstr->len : pstr->bufs_len;
 428                     byte_idx += mbcdlen;
 429                     src_idx += mbclen;
 430                     continue;
 431                   }
 432                 else
 433                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 434               }
 435             else
 436               memcpy (pstr->mbs + byte_idx, p, mbclen);
 437
 438             if (BE (pstr->offsets_needed != 0, 0))
 439               {
 440                 size_t i;
 441                 for (i = 0; i < mbclen; ++i)
 442                   pstr->offsets[byte_idx + i] = src_idx + i;
 443               }
 444             src_idx += mbclen;
 445
 446             pstr->wcs[byte_idx++] = wcu;
 447             /* Write paddings.  */
 448             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 449               pstr->wcs[byte_idx++] = WEOF;
 450           }
 451         else if (mbclen == (size_t) -1 || mbclen == 0)
 452           {
 453             /* It is an invalid character or '\0'.  Just use the byte.  */
 454             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 455
 456             if (BE (pstr->trans != NULL, 0))
 457               ch = pstr->trans [ch];
 458             pstr->mbs[byte_idx] = ch;
 459
 460             if (BE (pstr->offsets_needed != 0, 0))
 461               pstr->offsets[byte_idx] = src_idx;
 462             ++src_idx;
 463
 464             /* And also cast it to wide char.  */
 465             pstr->wcs[byte_idx++] = (wchar_t) ch;
 466             if (BE (mbclen == (size_t) -1, 0))
 467               pstr->cur_state = prev_st;
 468           }
 469         else
 470           {
 471             /* The buffer doesn't have enough space, finish to build.  */
 472             pstr->cur_state = prev_st;
 473             break;
 474           }
 475       }
 476   pstr->valid_len = byte_idx;
 477   pstr->valid_raw_len = src_idx;
 478   return REG_NOERROR;
 479 }
 480
 481 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 482    Return the index.  */
 483
 484 static Idx
 485 internal_function
 486 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 487 {
 488   mbstate_t prev_st;
 489   Idx rawbuf_idx;
 490   size_t mbclen;
 491   wchar_t wc = WEOF;
 492
 493   /* Skip the characters which are not necessary to check.  */
 494   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 495        rawbuf_idx < new_raw_idx;)
 496     {
 497       Idx remain_len;
 498       remain_len = pstr->len - rawbuf_idx;
 499       prev_st = pstr->cur_state;
 500       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
 501                         remain_len, &pstr->cur_state);
 502       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 503         {
 504           /* We treat these cases as a single byte character.  */
 505           if (mbclen == 0 || remain_len == 0)
 506             wc = L'\0';
 507           else
 508             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 509           mbclen = 1;
 510           pstr->cur_state = prev_st;
 511         }
 512       /* Then proceed the next character.  */
 513       rawbuf_idx += mbclen;
 514     }
 515   *last_wc = (wint_t) wc;
 516   return rawbuf_idx;
 517 }
 518 #endif /* RE_ENABLE_I18N  */
 519
 520 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 521    This function is used in case of REG_ICASE.  */
 522
 523 static void
 524 internal_function
 525 build_upper_buffer (re_string_t *pstr)
 526 {
 527   Idx char_idx, end_idx;
 528   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 529
 530   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 531     {
 532       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 533       if (BE (pstr->trans != NULL, 0))
 534         ch = pstr->trans[ch];
 535       if (islower (ch))
 536         pstr->mbs[char_idx] = toupper (ch);
 537       else
 538         pstr->mbs[char_idx] = ch;
 539     }
 540   pstr->valid_len = char_idx;
 541   pstr->valid_raw_len = char_idx;
 542 }
 543
 544 /* Apply TRANS to the buffer in PSTR.  */
 545
 546 static void
 547 internal_function
 548 re_string_translate_buffer (re_string_t *pstr)
 549 {
 550   Idx buf_idx, end_idx;
 551   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 552
 553   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 554     {
 555       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 556       pstr->mbs[buf_idx] = pstr->trans[ch];
 557     }
 558
 559   pstr->valid_len = buf_idx;
 560   pstr->valid_raw_len = buf_idx;
 561 }
 562
 563 /* This function re-construct the buffers.
 564    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 565    convert to upper case in case of REG_ICASE, apply translation.  */
 566
 567 static reg_errcode_t
 568 internal_function
 569 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 570 {
 571   Idx offset;
 572
 573   if (BE (pstr->raw_mbs_idx <= idx, 0))
 574     offset = idx - pstr->raw_mbs_idx;
 575   else
 576     {
 577       /* Reset buffer.  */
 578 #ifdef RE_ENABLE_I18N
 579       if (pstr->mb_cur_max > 1)
 580         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 581 #endif /* RE_ENABLE_I18N */
 582       pstr->len = pstr->raw_len;
 583       pstr->stop = pstr->raw_stop;
 584       pstr->valid_len = 0;
 585       pstr->raw_mbs_idx = 0;
 586       pstr->valid_raw_len = 0;
 587       pstr->offsets_needed = 0;
 588       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 589                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 590       if (!pstr->mbs_allocated)
 591         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 592       offset = idx;
 593     }
 594
 595   if (BE (offset != 0, 1))
 596     {
 597       /* Are the characters which are already checked remain?  */
 598       if (BE (offset < pstr->valid_raw_len, 1)
 599 #ifdef RE_ENABLE_I18N
 600           /* Handling this would enlarge the code too much.
 601              Accept a slowdown in that case.  */
 602           && pstr->offsets_needed == 0
 603 #endif
 604          )
 605         {
 606           /* Yes, move them to the front of the buffer.  */
 607           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 608 #ifdef RE_ENABLE_I18N
 609           if (pstr->mb_cur_max > 1)
 610             memmove (pstr->wcs, pstr->wcs + offset,
 611                      (pstr->valid_len - offset) * sizeof (wint_t));
 612 #endif /* RE_ENABLE_I18N */
 613           if (BE (pstr->mbs_allocated, 0))
 614             memmove (pstr->mbs, pstr->mbs + offset,
 615                      pstr->valid_len - offset);
 616           pstr->valid_len -= offset;
 617           pstr->valid_raw_len -= offset;
 618 #if DEBUG
 619           assert (pstr->valid_len > 0);
 620 #endif
 621         }
 622       else
 623         {
 624           /* No, skip all characters until IDX.  */
 625 #ifdef RE_ENABLE_I18N
 626           if (BE (pstr->offsets_needed, 0))
 627             {
 628               pstr->len = pstr->raw_len - idx + offset;
 629               pstr->stop = pstr->raw_stop - idx + offset;
 630               pstr->offsets_needed = 0;
 631             }
 632 #endif
 633           pstr->valid_len = 0;
 634 #ifdef RE_ENABLE_I18N
 635           if (pstr->mb_cur_max > 1)
 636             {
 637               Idx wcs_idx;
 638               wint_t wc = WEOF;
 639
 640               if (pstr->is_utf8)
 641                 {
 642                   const unsigned char *raw, *p, *q, *end;
 643
 644                   /* Special case UTF-8.  Multi-byte chars start with any
 645                      byte other than 0x80 - 0xbf.  */
 646                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 647                   end = raw + (offset - pstr->mb_cur_max);
 648                   p = raw + offset - 1;
 649 #ifdef _LIBC
 650                   /* We know the wchar_t encoding is UCS4, so for the simple
 651                      case, ASCII characters, skip the conversion step.  */
 652                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 653                     {
 654                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 655                       pstr->valid_len = 0;
 656                       wc = (wchar_t) *p;
 657                     }
 658                   else
 659 #endif
 660                     for (; p >= end; --p)
 661                       if ((*p & 0xc0) != 0x80)
 662                         {
 663                           mbstate_t cur_state;
 664                           wchar_t wc2;
 665                           Idx mlen = raw + pstr->len - p;
 666                           unsigned char buf[6];
 667                           size_t mbclen;
 668
 669                           q = p;
 670                           if (BE (pstr->trans != NULL, 0))
 671                             {
 672                               int i = mlen < 6 ? mlen : 6;
 673                               while (--i >= 0)
 674                                 buf[i] = pstr->trans[p[i]];
 675                               q = buf;
 676                             }
 677                           /* XXX Don't use mbrtowc, we know which conversion
 678                              to use (UTF-8 -> UCS4).  */
 679                           memset (&cur_state, 0, sizeof (cur_state));
 680                           mbclen = mbrtowc (&wc2, (const char *) p, mlen,
 681                                             &cur_state);
 682                           if (raw + offset - p <= mbclen
 683                               && mbclen < (size_t) -2)
 684                             {
 685                               memset (&pstr->cur_state, '\0',
 686                                       sizeof (mbstate_t));
 687                               pstr->valid_len = mbclen - (raw + offset - p);
 688                               wc = wc2;
 689                             }
 690                           break;
 691                         }
 692                 }
 693
 694               if (wc == WEOF)
 695                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 696               if (wc == WEOF)
 697                 pstr->tip_context
 698                   = re_string_context_at (pstr, pstr->valid_raw_len - 1, eflags);
 699               else
 700                 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 701                                       && IS_WIDE_WORD_CHAR (wc))
 702                                      ? CONTEXT_WORD
 703                                      : ((IS_WIDE_NEWLINE (wc)
 704                                          && pstr->newline_anchor)
 705                                         ? CONTEXT_NEWLINE : 0));
 706               if (BE (pstr->valid_len, 0))
 707                 {
 708                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 709                     pstr->wcs[wcs_idx] = WEOF;
 710                   if (pstr->mbs_allocated)
 711                     memset (pstr->mbs, -1, pstr->valid_len);
 712                 }
 713               pstr->valid_raw_len = pstr->valid_len;
 714             }
 715           else
 716 #endif /* RE_ENABLE_I18N */
 717             {
 718               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 719               pstr->valid_raw_len = 0;
 720               if (pstr->trans)
 721                 c = pstr->trans[c];
 722               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 723                                    ? CONTEXT_WORD
 724                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 725                                       ? CONTEXT_NEWLINE : 0));
 726             }
 727         }
 728       if (!BE (pstr->mbs_allocated, 0))
 729         pstr->mbs += offset;
 730     }
 731   pstr->raw_mbs_idx = idx;
 732   pstr->len -= offset;
 733   pstr->stop -= offset;
 734
 735   /* Then build the buffers.  */
 736 #ifdef RE_ENABLE_I18N
 737   if (pstr->mb_cur_max > 1)
 738     {
 739       if (pstr->icase)
 740         {
 741           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 742           if (BE (ret != REG_NOERROR, 0))
 743             return ret;
 744         }
 745       else
 746         build_wcs_buffer (pstr);
 747     }
 748   else
 749 #endif /* RE_ENABLE_I18N */
 750     if (BE (pstr->mbs_allocated, 0))
 751       {
 752         if (pstr->icase)
 753           build_upper_buffer (pstr);
 754         else if (pstr->trans != NULL)
 755           re_string_translate_buffer (pstr);
 756       }
 757     else
 758       pstr->valid_len = pstr->len;
 759
 760   pstr->cur_idx = 0;
 761   return REG_NOERROR;
 762 }
 763
 764 static unsigned char
 765 internal_function __attribute ((pure))
 766 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 767 {
 768   int ch;
 769   Idx off;
 770
 771   /* Handle the common (easiest) cases first.  */
 772   if (BE (!pstr->mbs_allocated, 1))
 773     return re_string_peek_byte (pstr, idx);
 774
 775 #ifdef RE_ENABLE_I18N
 776   if (pstr->mb_cur_max > 1
 777       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 778     return re_string_peek_byte (pstr, idx);
 779 #endif
 780
 781   off = pstr->cur_idx + idx;
 782 #ifdef RE_ENABLE_I18N
 783   if (pstr->offsets_needed)
 784     off = pstr->offsets[off];
 785 #endif
 786
 787   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 788
 789 #ifdef RE_ENABLE_I18N
 790   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 791      this function returns CAPITAL LETTER I instead of first byte of
 792      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 793      since peek_byte_case doesn't advance cur_idx in any way.  */
 794   if (pstr->offsets_needed && !isascii (ch))
 795     return re_string_peek_byte (pstr, idx);
 796 #endif
 797
 798   return ch;
 799 }
 800
 801 static unsigned char
 802 internal_function __attribute ((pure))
 803 re_string_fetch_byte_case (re_string_t *pstr)
 804 {
 805   if (BE (!pstr->mbs_allocated, 1))
 806     return re_string_fetch_byte (pstr);
 807
 808 #ifdef RE_ENABLE_I18N
 809   if (pstr->offsets_needed)
 810     {
 811       Idx off;
 812       int ch;
 813
 814       /* For tr_TR.UTF-8 [[:islower:]] there is
 815          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 816          in that case the whole multi-byte character and return
 817          the original letter.  On the other side, with
 818          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 819          anything else would complicate things too much.  */
 820
 821       if (!re_string_first_byte (pstr, pstr->cur_idx))
 822         return re_string_fetch_byte (pstr);
 823
 824       off = pstr->offsets[pstr->cur_idx];
 825       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 826
 827       if (! isascii (ch))
 828         return re_string_fetch_byte (pstr);
 829
 830       re_string_skip_bytes (pstr,
 831                             re_string_char_size_at (pstr, pstr->cur_idx));
 832       return ch;
 833     }
 834 #endif
 835
 836   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 837 }
 838
 839 static void
 840 internal_function
 841 re_string_destruct (re_string_t *pstr)
 842 {
 843 #ifdef RE_ENABLE_I18N
 844   re_free (pstr->wcs);
 845   re_free (pstr->offsets);
 846 #endif /* RE_ENABLE_I18N  */
 847   if (pstr->mbs_allocated)
 848     re_free (pstr->mbs);
 849 }
 850
 851 /* Return the context at IDX in INPUT.  */
 852
 853 static unsigned int
 854 internal_function
 855 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 856 {
 857   int c;
 858   if (BE (! REG_VALID_INDEX (idx), 0))
 859     /* In this case, we use the value stored in input->tip_context,
 860        since we can't know the character in input->mbs[-1] here.  */
 861     return input->tip_context;
 862   if (BE (idx == input->len, 0))
 863     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 864             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 865 #ifdef RE_ENABLE_I18N
 866   if (input->mb_cur_max > 1)
 867     {
 868       wint_t wc;
 869       Idx wc_idx = idx;
 870       while(input->wcs[wc_idx] == WEOF)
 871         {
 872 #ifdef DEBUG
 873           /* It must not happen.  */
 874           assert (REG_VALID_INDEX (wc_idx));
 875 #endif
 876           --wc_idx;
 877           if (! REG_VALID_INDEX (wc_idx))
 878             return input->tip_context;
 879         }
 880       wc = input->wcs[wc_idx];
 881       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 882         return CONTEXT_WORD;
 883       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 884               ? CONTEXT_NEWLINE : 0);
 885     }
 886   else
 887 #endif
 888     {
 889       c = re_string_byte_at (input, idx);
 890       if (bitset_contain (input->word_char, c))
 891         return CONTEXT_WORD;
 892       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 893     }
 894 }
 895 \f
 896 /* Functions for set operation.  */
 897
 898 static reg_errcode_t
 899 internal_function
 900 re_node_set_alloc (re_node_set *set, Idx size)
 901 {
 902   set->alloc = size;
 903   set->nelem = 0;
 904   set->elems = re_malloc (Idx, size);
 905   if (BE (set->elems == NULL, 0))
 906     return REG_ESPACE;
 907   return REG_NOERROR;
 908 }
 909
 910 static reg_errcode_t
 911 internal_function
 912 re_node_set_init_1 (re_node_set *set, Idx elem)
 913 {
 914   set->alloc = 1;
 915   set->nelem = 1;
 916   set->elems = re_malloc (Idx, 1);
 917   if (BE (set->elems == NULL, 0))
 918     {
 919       set->alloc = set->nelem = 0;
 920       return REG_ESPACE;
 921     }
 922   set->elems[0] = elem;
 923   return REG_NOERROR;
 924 }
 925
 926 static reg_errcode_t
 927 internal_function
 928 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
 929 {
 930   set->alloc = 2;
 931   set->elems = re_malloc (Idx, 2);
 932   if (BE (set->elems == NULL, 0))
 933     return REG_ESPACE;
 934   if (elem1 == elem2)
 935     {
 936       set->nelem = 1;
 937       set->elems[0] = elem1;
 938     }
 939   else
 940     {
 941       set->nelem = 2;
 942       if (elem1 < elem2)
 943         {
 944           set->elems[0] = elem1;
 945           set->elems[1] = elem2;
 946         }
 947       else
 948         {
 949           set->elems[0] = elem2;
 950           set->elems[1] = elem1;
 951         }
 952     }
 953   return REG_NOERROR;
 954 }
 955
 956 static reg_errcode_t
 957 internal_function
 958 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
 959 {
 960   dest->nelem = src->nelem;
 961   if (src->nelem > 0)
 962     {
 963       dest->alloc = dest->nelem;
 964       dest->elems = re_malloc (Idx, dest->alloc);
 965       if (BE (dest->elems == NULL, 0))
 966         {
 967           dest->alloc = dest->nelem = 0;
 968           return REG_ESPACE;
 969         }
 970       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
 971     }
 972   else
 973     re_node_set_init_empty (dest);
 974   return REG_NOERROR;
 975 }
 976
 977 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
 978    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
 979    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
 980
 981 static reg_errcode_t
 982 internal_function
 983 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
 984                            const re_node_set *src2)
 985 {
 986   Idx i1, i2, is, id, delta, sbase;
 987   if (src1->nelem == 0 || src2->nelem == 0)
 988     return REG_NOERROR;
 989
 990   /* We need dest->nelem + 2 * elems_in_intersection; this is a
 991      conservative estimate.  */
 992   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
 993     {
 994       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
 995       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
 996       if (BE (new_elems == NULL, 0))
 997         return REG_ESPACE;
 998       dest->elems = new_elems;
 999       dest->alloc = new_alloc;
1000     }
1001
1002   /* Find the items in the intersection of SRC1 and SRC2, and copy
1003      into the top of DEST those that are not already in DEST itself.  */
1004   sbase = dest->nelem + src1->nelem + src2->nelem;
1005   i1 = src1->nelem - 1;
1006   i2 = src2->nelem - 1;
1007   id = dest->nelem - 1;
1008   for (;;)
1009     {
1010       if (src1->elems[i1] == src2->elems[i2])
1011         {
1012           /* Try to find the item in DEST.  Maybe we could binary search?  */
1013           while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1014             --id;
1015
1016           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1017             dest->elems[--sbase] = src1->elems[i1];
1018
1019           if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1020             break;
1021         }
1022
1023       /* Lower the highest of the two items.  */
1024       else if (src1->elems[i1] < src2->elems[i2])
1025         {
1026           if (! REG_VALID_INDEX (--i2))
1027             break;
1028         }
1029       else
1030         {
1031           if (! REG_VALID_INDEX (--i1))
1032             break;
1033         }
1034     }
1035
1036   id = dest->nelem - 1;
1037   is = dest->nelem + src1->nelem + src2->nelem - 1;
1038   delta = is - sbase + 1;
1039
1040   /* Now copy.  When DELTA becomes zero, the remaining
1041      DEST elements are already in place; this is more or
1042      less the same loop that is in re_node_set_merge.  */
1043   dest->nelem += delta;
1044   if (delta > 0 && REG_VALID_INDEX (id))
1045     for (;;)
1046       {
1047         if (dest->elems[is] > dest->elems[id])
1048           {
1049             /* Copy from the top.  */
1050             dest->elems[id + delta--] = dest->elems[is--];
1051             if (delta == 0)
1052               break;
1053           }
1054         else
1055           {
1056             /* Slide from the bottom.  */
1057             dest->elems[id + delta] = dest->elems[id];
1058             if (! REG_VALID_INDEX (--id))
1059               break;
1060           }
1061       }
1062
1063   /* Copy remaining SRC elements.  */
1064   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1065
1066   return REG_NOERROR;
1067 }
1068
1069 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1070    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1071
1072 static reg_errcode_t
1073 internal_function
1074 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1075                         const re_node_set *src2)
1076 {
1077   Idx i1, i2, id;
1078   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1079     {
1080       dest->alloc = src1->nelem + src2->nelem;
1081       dest->elems = re_malloc (Idx, dest->alloc);
1082       if (BE (dest->elems == NULL, 0))
1083         return REG_ESPACE;
1084     }
1085   else
1086     {
1087       if (src1 != NULL && src1->nelem > 0)
1088         return re_node_set_init_copy (dest, src1);
1089       else if (src2 != NULL && src2->nelem > 0)
1090         return re_node_set_init_copy (dest, src2);
1091       else
1092         re_node_set_init_empty (dest);
1093       return REG_NOERROR;
1094     }
1095   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1096     {
1097       if (src1->elems[i1] > src2->elems[i2])
1098         {
1099           dest->elems[id++] = src2->elems[i2++];
1100           continue;
1101         }
1102       if (src1->elems[i1] == src2->elems[i2])
1103         ++i2;
1104       dest->elems[id++] = src1->elems[i1++];
1105     }
1106   if (i1 < src1->nelem)
1107     {
1108       memcpy (dest->elems + id, src1->elems + i1,
1109              (src1->nelem - i1) * sizeof (Idx));
1110       id += src1->nelem - i1;
1111     }
1112   else if (i2 < src2->nelem)
1113     {
1114       memcpy (dest->elems + id, src2->elems + i2,
1115              (src2->nelem - i2) * sizeof (Idx));
1116       id += src2->nelem - i2;
1117     }
1118   dest->nelem = id;
1119   return REG_NOERROR;
1120 }
1121
1122 /* Calculate the union set of the sets DEST and SRC. And store it to
1123    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1124
1125 static reg_errcode_t
1126 internal_function
1127 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1128 {
1129   Idx is, id, sbase, delta;
1130   if (src == NULL || src->nelem == 0)
1131     return REG_NOERROR;
1132   if (dest->alloc < 2 * src->nelem + dest->nelem)
1133     {
1134       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1135       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1136       if (BE (new_buffer == NULL, 0))
1137         return REG_ESPACE;
1138       dest->elems = new_buffer;
1139       dest->alloc = new_alloc;
1140     }
1141
1142   if (BE (dest->nelem == 0, 0))
1143     {
1144       dest->nelem = src->nelem;
1145       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1146       return REG_NOERROR;
1147     }
1148
1149   /* Copy into the top of DEST the items of SRC that are not
1150      found in DEST.  Maybe we could binary search in DEST?  */
1151   for (sbase = dest->nelem + 2 * src->nelem,
1152        is = src->nelem - 1, id = dest->nelem - 1;
1153        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1154     {
1155       if (dest->elems[id] == src->elems[is])
1156         is--, id--;
1157       else if (dest->elems[id] < src->elems[is])
1158         dest->elems[--sbase] = src->elems[is--];
1159       else /* if (dest->elems[id] > src->elems[is]) */
1160         --id;
1161     }
1162
1163   if (REG_VALID_INDEX (is))
1164     {
1165       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1166       sbase -= is + 1;
1167       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1168     }
1169
1170   id = dest->nelem - 1;
1171   is = dest->nelem + 2 * src->nelem - 1;
1172   delta = is - sbase + 1;
1173   if (delta == 0)
1174     return REG_NOERROR;
1175
1176   /* Now copy.  When DELTA becomes zero, the remaining
1177      DEST elements are already in place.  */
1178   dest->nelem += delta;
1179   for (;;)
1180     {
1181       if (dest->elems[is] > dest->elems[id])
1182         {
1183           /* Copy from the top.  */
1184           dest->elems[id + delta--] = dest->elems[is--];
1185           if (delta == 0)
1186             break;
1187         }
1188       else
1189         {
1190           /* Slide from the bottom.  */
1191           dest->elems[id + delta] = dest->elems[id];
1192           if (! REG_VALID_INDEX (--id))
1193             {
1194               /* Copy remaining SRC elements.  */
1195               memcpy (dest->elems, dest->elems + sbase,
1196                       delta * sizeof (Idx));
1197               break;
1198             }
1199         }
1200     }
1201
1202   return REG_NOERROR;
1203 }
1204
1205 /* Insert the new element ELEM to the re_node_set* SET.
1206    SET should not already have ELEM.
1207    Return true if successful.  */
1208
1209 static bool
1210 internal_function
1211 re_node_set_insert (re_node_set *set, Idx elem)
1212 {
1213   Idx idx;
1214   /* In case the set is empty.  */
1215   if (set->alloc == 0)
1216     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1217
1218   if (BE (set->nelem, 0) == 0)
1219     {
1220       /* We already guaranteed above that set->alloc != 0.  */
1221       set->elems[0] = elem;
1222       ++set->nelem;
1223       return true;
1224     }
1225
1226   /* Realloc if we need.  */
1227   if (set->alloc == set->nelem)
1228     {
1229       Idx *new_elems;
1230       set->alloc = set->alloc * 2;
1231       new_elems = re_realloc (set->elems, Idx, set->alloc);
1232       if (BE (new_elems == NULL, 0))
1233         return false;
1234       set->elems = new_elems;
1235     }
1236
1237   /* Move the elements which follows the new element.  Test the
1238      first element separately to skip a check in the inner loop.  */
1239   if (elem < set->elems[0])
1240     {
1241       idx = 0;
1242       for (idx = set->nelem; idx > 0; idx--)
1243         set->elems[idx] = set->elems[idx - 1];
1244     }
1245   else
1246     {
1247       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1248         set->elems[idx] = set->elems[idx - 1];
1249     }
1250
1251   /* Insert the new element.  */
1252   set->elems[idx] = elem;
1253   ++set->nelem;
1254   return true;
1255 }
1256
1257 /* Insert the new element ELEM to the re_node_set* SET.
1258    SET should not already have any element greater than or equal to ELEM.
1259    Return true if successful.  */
1260
1261 static bool
1262 internal_function
1263 re_node_set_insert_last (re_node_set *set, Idx elem)
1264 {
1265   /* Realloc if we need.  */
1266   if (set->alloc == set->nelem)
1267     {
1268       Idx *new_elems;
1269       set->alloc = (set->alloc + 1) * 2;
1270       new_elems = re_realloc (set->elems, Idx, set->alloc);
1271       if (BE (new_elems == NULL, 0))
1272         return false;
1273       set->elems = new_elems;
1274     }
1275
1276   /* Insert the new element.  */
1277   set->elems[set->nelem++] = elem;
1278   return true;
1279 }
1280
1281 /* Compare two node sets SET1 and SET2.
1282    Return true if SET1 and SET2 are equivalent.  */
1283
1284 static bool
1285 internal_function __attribute ((pure))
1286 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1287 {
1288   Idx i;
1289   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1290     return false;
1291   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1292     if (set1->elems[i] != set2->elems[i])
1293       return false;
1294   return true;
1295 }
1296
1297 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1298
1299 static Idx
1300 internal_function __attribute ((pure))
1301 re_node_set_contains (const re_node_set *set, Idx elem)
1302 {
1303   __re_size_t idx, right, mid;
1304   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1305     return 0;
1306
1307   /* Binary search the element.  */
1308   idx = 0;
1309   right = set->nelem - 1;
1310   while (idx < right)
1311     {
1312       mid = (idx + right) / 2;
1313       if (set->elems[mid] < elem)
1314         idx = mid + 1;
1315       else
1316         right = mid;
1317     }
1318   return set->elems[idx] == elem ? idx + 1 : 0;
1319 }
1320
1321 static void
1322 internal_function
1323 re_node_set_remove_at (re_node_set *set, Idx idx)
1324 {
1325   if (idx < 0 || idx >= set->nelem)
1326     return;
1327   --set->nelem;
1328   for (; idx < set->nelem; idx++)
1329     set->elems[idx] = set->elems[idx + 1];
1330 }
1331 \f
1332
1333 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1334    Or return REG_MISSING if an error occurred.  */
1335
1336 static Idx
1337 internal_function
1338 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1339 {
1340   int type = token.type;
1341   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1342     {
1343       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1344       Idx *new_nexts, *new_indices;
1345       re_node_set *new_edests, *new_eclosures;
1346       re_token_t *new_nodes;
1347       size_t max_object_size =
1348         MAX (sizeof (re_token_t),
1349              MAX (sizeof (re_node_set),
1350                   sizeof (Idx)));
1351
1352       /* Avoid overflows.  */
1353       if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1354         return REG_MISSING;
1355
1356       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1357       if (BE (new_nodes == NULL, 0))
1358         return REG_MISSING;
1359       dfa->nodes = new_nodes;
1360       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1361       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1362       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1363       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1364       if (BE (new_nexts == NULL || new_indices == NULL
1365               || new_edests == NULL || new_eclosures == NULL, 0))
1366         return REG_MISSING;
1367       dfa->nexts = new_nexts;
1368       dfa->org_indices = new_indices;
1369       dfa->edests = new_edests;
1370       dfa->eclosures = new_eclosures;
1371       dfa->nodes_alloc = new_nodes_alloc;
1372     }
1373   dfa->nodes[dfa->nodes_len] = token;
1374   dfa->nodes[dfa->nodes_len].constraint = 0;
1375 #ifdef RE_ENABLE_I18N
1376   dfa->nodes[dfa->nodes_len].accept_mb =
1377     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1378 #endif
1379   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1380   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1381   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1382   return dfa->nodes_len++;
1383 }
1384
1385 static inline re_hashval_t
1386 internal_function
1387 calc_state_hash (const re_node_set *nodes, unsigned int context)
1388 {
1389   re_hashval_t hash = nodes->nelem + context;
1390   Idx i;
1391   for (i = 0 ; i < nodes->nelem ; i++)
1392     hash += nodes->elems[i];
1393   return hash;
1394 }
1395
1396 /* Search for the state whose node_set is equivalent to NODES.
1397    Return the pointer to the state, if we found it in the DFA.
1398    Otherwise create the new one and return it.  In case of an error
1399    return NULL and set the error code in ERR.
1400    Note: - We assume NULL as the invalid state, then it is possible that
1401            return value is NULL and ERR is REG_NOERROR.
1402          - We never return non-NULL value in case of any errors, it is for
1403            optimization.  */
1404
1405 static re_dfastate_t *
1406 internal_function
1407 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1408                   const re_node_set *nodes)
1409 {
1410   re_hashval_t hash;
1411   re_dfastate_t *new_state;
1412   struct re_state_table_entry *spot;
1413   Idx i;
1414 #ifdef lint
1415   /* Suppress bogus uninitialized-variable warnings.  */
1416   *err = REG_NOERROR;
1417 #endif
1418   if (BE (nodes->nelem == 0, 0))
1419     {
1420       *err = REG_NOERROR;
1421       return NULL;
1422     }
1423   hash = calc_state_hash (nodes, 0);
1424   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1425
1426   for (i = 0 ; i < spot->num ; i++)
1427     {
1428       re_dfastate_t *state = spot->array[i];
1429       if (hash != state->hash)
1430         continue;
1431       if (re_node_set_compare (&state->nodes, nodes))
1432         return state;
1433     }
1434
1435   /* There are no appropriate state in the dfa, create the new one.  */
1436   new_state = create_ci_newstate (dfa, nodes, hash);
1437   if (BE (new_state == NULL, 0))
1438     *err = REG_ESPACE;
1439
1440   return new_state;
1441 }
1442
1443 /* Search for the state whose node_set is equivalent to NODES and
1444    whose context is equivalent to CONTEXT.
1445    Return the pointer to the state, if we found it in the DFA.
1446    Otherwise create the new one and return it.  In case of an error
1447    return NULL and set the error code in ERR.
1448    Note: - We assume NULL as the invalid state, then it is possible that
1449            return value is NULL and ERR is REG_NOERROR.
1450          - We never return non-NULL value in case of any errors, it is for
1451            optimization.  */
1452
1453 static re_dfastate_t *
1454 internal_function
1455 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1456                           const re_node_set *nodes, unsigned int context)
1457 {
1458   re_hashval_t hash;
1459   re_dfastate_t *new_state;
1460   struct re_state_table_entry *spot;
1461   Idx i;
1462 #ifdef lint
1463   /* Suppress bogus uninitialized-variable warnings.  */
1464   *err = REG_NOERROR;
1465 #endif
1466   if (nodes->nelem == 0)
1467     {
1468       *err = REG_NOERROR;
1469       return NULL;
1470     }
1471   hash = calc_state_hash (nodes, context);
1472   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1473
1474   for (i = 0 ; i < spot->num ; i++)
1475     {
1476       re_dfastate_t *state = spot->array[i];
1477       if (state->hash == hash
1478           && state->context == context
1479           && re_node_set_compare (state->entrance_nodes, nodes))
1480         return state;
1481     }
1482   /* There are no appropriate state in `dfa', create the new one.  */
1483   new_state = create_cd_newstate (dfa, nodes, context, hash);
1484   if (BE (new_state == NULL, 0))
1485     *err = REG_ESPACE;
1486
1487   return new_state;
1488 }
1489
1490 /* Finish initialization of the new state NEWSTATE, and using its hash value
1491    HASH put in the appropriate bucket of DFA's state table.  Return value
1492    indicates the error code if failed.  */
1493
1494 static reg_errcode_t
1495 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1496                 re_hashval_t hash)
1497 {
1498   struct re_state_table_entry *spot;
1499   reg_errcode_t err;
1500   Idx i;
1501
1502   newstate->hash = hash;
1503   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1504   if (BE (err != REG_NOERROR, 0))
1505     return REG_ESPACE;
1506   for (i = 0; i < newstate->nodes.nelem; i++)
1507     {
1508       Idx elem = newstate->nodes.elems[i];
1509       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1510         if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1511           return REG_ESPACE;
1512     }
1513
1514   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1515   if (BE (spot->alloc <= spot->num, 0))
1516     {
1517       Idx new_alloc = 2 * spot->num + 2;
1518       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1519                                               new_alloc);
1520       if (BE (new_array == NULL, 0))
1521         return REG_ESPACE;
1522       spot->array = new_array;
1523       spot->alloc = new_alloc;
1524     }
1525   spot->array[spot->num++] = newstate;
1526   return REG_NOERROR;
1527 }
1528
1529 static void
1530 free_state (re_dfastate_t *state)
1531 {
1532   re_node_set_free (&state->non_eps_nodes);
1533   re_node_set_free (&state->inveclosure);
1534   if (state->entrance_nodes != &state->nodes)
1535     {
1536       re_node_set_free (state->entrance_nodes);
1537       re_free (state->entrance_nodes);
1538     }
1539   re_node_set_free (&state->nodes);
1540   re_free (state->word_trtable);
1541   re_free (state->trtable);
1542   re_free (state);
1543 }
1544
1545 /* Create the new state which is independ of contexts.
1546    Return the new state if succeeded, otherwise return NULL.  */
1547
1548 static re_dfastate_t *
1549 internal_function
1550 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1551                     re_hashval_t hash)
1552 {
1553   Idx i;
1554   reg_errcode_t err;
1555   re_dfastate_t *newstate;
1556
1557   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1558   if (BE (newstate == NULL, 0))
1559     return NULL;
1560   err = re_node_set_init_copy (&newstate->nodes, nodes);
1561   if (BE (err != REG_NOERROR, 0))
1562     {
1563       re_free (newstate);
1564       return NULL;
1565     }
1566
1567   newstate->entrance_nodes = &newstate->nodes;
1568   for (i = 0 ; i < nodes->nelem ; i++)
1569     {
1570       re_token_t *node = dfa->nodes + nodes->elems[i];
1571       re_token_type_t type = node->type;
1572       if (type == CHARACTER && !node->constraint)
1573         continue;
1574 #ifdef RE_ENABLE_I18N
1575       newstate->accept_mb |= node->accept_mb;
1576 #endif /* RE_ENABLE_I18N */
1577
1578       /* If the state has the halt node, the state is a halt state.  */
1579       if (type == END_OF_RE)
1580         newstate->halt = 1;
1581       else if (type == OP_BACK_REF)
1582         newstate->has_backref = 1;
1583       else if (type == ANCHOR || node->constraint)
1584         newstate->has_constraint = 1;
1585     }
1586   err = register_state (dfa, newstate, hash);
1587   if (BE (err != REG_NOERROR, 0))
1588     {
1589       free_state (newstate);
1590       newstate = NULL;
1591     }
1592   return newstate;
1593 }
1594
1595 /* Create the new state which is depend on the context CONTEXT.
1596    Return the new state if succeeded, otherwise return NULL.  */
1597
1598 static re_dfastate_t *
1599 internal_function
1600 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1601                     unsigned int context, re_hashval_t hash)
1602 {
1603   Idx i, nctx_nodes = 0;
1604   reg_errcode_t err;
1605   re_dfastate_t *newstate;
1606
1607   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1608   if (BE (newstate == NULL, 0))
1609     return NULL;
1610   err = re_node_set_init_copy (&newstate->nodes, nodes);
1611   if (BE (err != REG_NOERROR, 0))
1612     {
1613       re_free (newstate);
1614       return NULL;
1615     }
1616
1617   newstate->context = context;
1618   newstate->entrance_nodes = &newstate->nodes;
1619
1620   for (i = 0 ; i < nodes->nelem ; i++)
1621     {
1622       unsigned int constraint = 0;
1623       re_token_t *node = dfa->nodes + nodes->elems[i];
1624       re_token_type_t type = node->type;
1625       if (node->constraint)
1626         constraint = node->constraint;
1627
1628       if (type == CHARACTER && !constraint)
1629         continue;
1630 #ifdef RE_ENABLE_I18N
1631       newstate->accept_mb |= node->accept_mb;
1632 #endif /* RE_ENABLE_I18N */
1633
1634       /* If the state has the halt node, the state is a halt state.  */
1635       if (type == END_OF_RE)
1636         newstate->halt = 1;
1637       else if (type == OP_BACK_REF)
1638         newstate->has_backref = 1;
1639       else if (type == ANCHOR)
1640         constraint = node->opr.ctx_type;
1641
1642       if (constraint)
1643         {
1644           if (newstate->entrance_nodes == &newstate->nodes)
1645             {
1646               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1647               if (BE (newstate->entrance_nodes == NULL, 0))
1648                 {
1649                   free_state (newstate);
1650                   return NULL;
1651                 }
1652               re_node_set_init_copy (newstate->entrance_nodes, nodes);
1653               nctx_nodes = 0;
1654               newstate->has_constraint = 1;
1655             }
1656
1657           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1658             {
1659               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1660               ++nctx_nodes;
1661             }
1662         }
1663     }
1664   err = register_state (dfa, newstate, hash);
1665   if (BE (err != REG_NOERROR, 0))
1666     {
1667       free_state (newstate);
1668       newstate = NULL;
1669     }
1670   return  newstate;
1671 }