lib/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static void re_string_construct_common (const char *str, Idx len,
  21                                         re_string_t *pstr,
  22                                         RE_TRANSLATE_TYPE trans, bool icase,
  23                                         const re_dfa_t *dfa) internal_function;
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           re_hashval_t hash) internal_function;
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           re_hashval_t hash) internal_function;
  31 \f
  32 /* Functions for string operation.  */
  33
  34 /* This function allocate the buffers.  It is necessary to call
  35    re_string_reconstruct before using the object.  */
  36
  37 static reg_errcode_t
  38 internal_function
  39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  40                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  41 {
  42   reg_errcode_t ret;
  43   Idx init_buf_len;
  44
  45   /* Ensure at least one character fits into the buffers.  */
  46   if (init_len < dfa->mb_cur_max)
  47     init_len = dfa->mb_cur_max;
  48   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  49   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  50
  51   ret = re_string_realloc_buffers (pstr, init_buf_len);
  52   if (BE (ret != REG_NOERROR, 0))
  53     return ret;
  54
  55   pstr->word_char = dfa->word_char;
  56   pstr->word_ops_used = dfa->word_ops_used;
  57   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  58   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  59   pstr->valid_raw_len = pstr->valid_len;
  60   return REG_NOERROR;
  61 }
  62
  63 /* This function allocate the buffers, and initialize them.  */
  64
  65 static reg_errcode_t
  66 internal_function
  67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  68                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  69 {
  70   reg_errcode_t ret;
  71   memset (pstr, '\0', sizeof (re_string_t));
  72   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  73
  74   if (len > 0)
  75     {
  76       ret = re_string_realloc_buffers (pstr, len + 1);
  77       if (BE (ret != REG_NOERROR, 0))
  78         return ret;
  79     }
  80   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  81
  82   if (icase)
  83     {
  84 #ifdef RE_ENABLE_I18N
  85       if (dfa->mb_cur_max > 1)
  86         {
  87           while (1)
  88             {
  89               ret = build_wcs_upper_buffer (pstr);
  90               if (BE (ret != REG_NOERROR, 0))
  91                 return ret;
  92               if (pstr->valid_raw_len >= len)
  93                 break;
  94               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  95                 break;
  96               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  97               if (BE (ret != REG_NOERROR, 0))
  98                 return ret;
  99             }
 100         }
 101       else
 102 #endif /* RE_ENABLE_I18N  */
 103         build_upper_buffer (pstr);
 104     }
 105   else
 106     {
 107 #ifdef RE_ENABLE_I18N
 108       if (dfa->mb_cur_max > 1)
 109         build_wcs_buffer (pstr);
 110       else
 111 #endif /* RE_ENABLE_I18N  */
 112         {
 113           if (trans != NULL)
 114             re_string_translate_buffer (pstr);
 115           else
 116             {
 117               pstr->valid_len = pstr->bufs_len;
 118               pstr->valid_raw_len = pstr->bufs_len;
 119             }
 120         }
 121     }
 122
 123   return REG_NOERROR;
 124 }
 125
 126 /* Helper functions for re_string_allocate, and re_string_construct.  */
 127
 128 static reg_errcode_t
 129 internal_function
 130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 131 {
 132 #ifdef RE_ENABLE_I18N
 133   if (pstr->mb_cur_max > 1)
 134     {
 135       wint_t *new_wcs;
 136
 137       /* Avoid overflow.  */
 138       size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
 139       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
 140         return REG_ESPACE;
 141
 142       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 143       if (BE (new_wcs == NULL, 0))
 144         return REG_ESPACE;
 145       pstr->wcs = new_wcs;
 146       if (pstr->offsets != NULL)
 147         {
 148           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 149           if (BE (new_offsets == NULL, 0))
 150             return REG_ESPACE;
 151           pstr->offsets = new_offsets;
 152         }
 153     }
 154 #endif /* RE_ENABLE_I18N  */
 155   if (pstr->mbs_allocated)
 156     {
 157       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 158                                            new_buf_len);
 159       if (BE (new_mbs == NULL, 0))
 160         return REG_ESPACE;
 161       pstr->mbs = new_mbs;
 162     }
 163   pstr->bufs_len = new_buf_len;
 164   return REG_NOERROR;
 165 }
 166
 167
 168 static void
 169 internal_function
 170 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 171                             RE_TRANSLATE_TYPE trans, bool icase,
 172                             const re_dfa_t *dfa)
 173 {
 174   pstr->raw_mbs = (const unsigned char *) str;
 175   pstr->len = len;
 176   pstr->raw_len = len;
 177   pstr->trans = trans;
 178   pstr->icase = icase;
 179   pstr->mbs_allocated = (trans != NULL || icase);
 180   pstr->mb_cur_max = dfa->mb_cur_max;
 181   pstr->is_utf8 = dfa->is_utf8;
 182   pstr->map_notascii = dfa->map_notascii;
 183   pstr->stop = pstr->len;
 184   pstr->raw_stop = pstr->stop;
 185 }
 186
 187 #ifdef RE_ENABLE_I18N
 188
 189 /* Build wide character buffer PSTR->WCS.
 190    If the byte sequence of the string are:
 191      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 192    Then wide character buffer will be:
 193      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 194    We use WEOF for padding, they indicate that the position isn't
 195    a first byte of a multibyte character.
 196
 197    Note that this function assumes PSTR->VALID_LEN elements are already
 198    built and starts from PSTR->VALID_LEN.  */
 199
 200 static void
 201 internal_function
 202 build_wcs_buffer (re_string_t *pstr)
 203 {
 204 #ifdef _LIBC
 205   unsigned char buf[MB_LEN_MAX];
 206   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 207 #else
 208   unsigned char buf[64];
 209 #endif
 210   mbstate_t prev_st;
 211   Idx byte_idx, end_idx, remain_len;
 212   size_t mbclen;
 213
 214   /* Build the buffers from pstr->valid_len to either pstr->len or
 215      pstr->bufs_len.  */
 216   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 217   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 218     {
 219       wchar_t wc;
 220       const char *p;
 221
 222       remain_len = end_idx - byte_idx;
 223       prev_st = pstr->cur_state;
 224       /* Apply the translation if we need.  */
 225       if (BE (pstr->trans != NULL, 0))
 226         {
 227           int i, ch;
 228
 229           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 230             {
 231               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 232               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 233             }
 234           p = (const char *) buf;
 235         }
 236       else
 237         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 238       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 239       if (BE (mbclen == (size_t) -2, 0))
 240         {
 241           /* The buffer doesn't have enough space, finish to build.  */
 242           pstr->cur_state = prev_st;
 243           break;
 244         }
 245       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 246         {
 247           /* We treat these cases as a singlebyte character.  */
 248           mbclen = 1;
 249           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 250           if (BE (pstr->trans != NULL, 0))
 251             wc = pstr->trans[wc];
 252           pstr->cur_state = prev_st;
 253         }
 254
 255       /* Write wide character and padding.  */
 256       pstr->wcs[byte_idx++] = wc;
 257       /* Write paddings.  */
 258       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 259         pstr->wcs[byte_idx++] = WEOF;
 260     }
 261   pstr->valid_len = byte_idx;
 262   pstr->valid_raw_len = byte_idx;
 263 }
 264
 265 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 266    but for REG_ICASE.  */
 267
 268 static reg_errcode_t
 269 internal_function
 270 build_wcs_upper_buffer (re_string_t *pstr)
 271 {
 272   mbstate_t prev_st;
 273   Idx src_idx, byte_idx, end_idx, remain_len;
 274   size_t mbclen;
 275 #ifdef _LIBC
 276   char buf[MB_LEN_MAX];
 277   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 278 #else
 279   char buf[64];
 280 #endif
 281
 282   byte_idx = pstr->valid_len;
 283   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 284
 285   /* The following optimization assumes that ASCII characters can be
 286      mapped to wide characters with a simple cast.  */
 287   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 288     {
 289       while (byte_idx < end_idx)
 290         {
 291           wchar_t wc;
 292
 293           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 294               && mbsinit (&pstr->cur_state))
 295             {
 296               /* In case of a singlebyte character.  */
 297               pstr->mbs[byte_idx]
 298                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 299               /* The next step uses the assumption that wchar_t is encoded
 300                  ASCII-safe: all ASCII values can be converted like this.  */
 301               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 302               ++byte_idx;
 303               continue;
 304             }
 305
 306           remain_len = end_idx - byte_idx;
 307           prev_st = pstr->cur_state;
 308           mbclen = mbrtowc (&wc,
 309                             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 310                              + byte_idx), remain_len, &pstr->cur_state);
 311           if (BE (mbclen < (size_t) -2, 1))
 312             {
 313               wchar_t wcu = wc;
 314               if (iswlower (wc))
 315                 {
 316                   size_t mbcdlen;
 317
 318                   wcu = towupper (wc);
 319                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 320                   if (BE (mbclen == mbcdlen, 1))
 321                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 322                   else
 323                     {
 324                       src_idx = byte_idx;
 325                       goto offsets_needed;
 326                     }
 327                 }
 328               else
 329                 memcpy (pstr->mbs + byte_idx,
 330                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 331               pstr->wcs[byte_idx++] = wcu;
 332               /* Write paddings.  */
 333               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 334                 pstr->wcs[byte_idx++] = WEOF;
 335             }
 336           else if (mbclen == (size_t) -1 || mbclen == 0)
 337             {
 338               /* It is an invalid character or '\0'.  Just use the byte.  */
 339               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 340               pstr->mbs[byte_idx] = ch;
 341               /* And also cast it to wide char.  */
 342               pstr->wcs[byte_idx++] = (wchar_t) ch;
 343               if (BE (mbclen == (size_t) -1, 0))
 344                 pstr->cur_state = prev_st;
 345             }
 346           else
 347             {
 348               /* The buffer doesn't have enough space, finish to build.  */
 349               pstr->cur_state = prev_st;
 350               break;
 351             }
 352         }
 353       pstr->valid_len = byte_idx;
 354       pstr->valid_raw_len = byte_idx;
 355       return REG_NOERROR;
 356     }
 357   else
 358     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 359       {
 360         wchar_t wc;
 361         const char *p;
 362       offsets_needed:
 363         remain_len = end_idx - byte_idx;
 364         prev_st = pstr->cur_state;
 365         if (BE (pstr->trans != NULL, 0))
 366           {
 367             int i, ch;
 368
 369             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 370               {
 371                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 372                 buf[i] = pstr->trans[ch];
 373               }
 374             p = (const char *) buf;
 375           }
 376         else
 377           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 378         mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 379         if (BE (mbclen < (size_t) -2, 1))
 380           {
 381             wchar_t wcu = wc;
 382             if (iswlower (wc))
 383               {
 384                 size_t mbcdlen;
 385
 386                 wcu = towupper (wc);
 387                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 388                 if (BE (mbclen == mbcdlen, 1))
 389                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 390                 else if (mbcdlen != (size_t) -1)
 391                   {
 392                     size_t i;
 393
 394                     if (byte_idx + mbcdlen > pstr->bufs_len)
 395                       {
 396                         pstr->cur_state = prev_st;
 397                         break;
 398                       }
 399
 400                     if (pstr->offsets == NULL)
 401                       {
 402                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 403
 404                         if (pstr->offsets == NULL)
 405                           return REG_ESPACE;
 406                       }
 407                     if (!pstr->offsets_needed)
 408                       {
 409                         for (i = 0; i < (size_t) byte_idx; ++i)
 410                           pstr->offsets[i] = i;
 411                         pstr->offsets_needed = 1;
 412                       }
 413
 414                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 415                     pstr->wcs[byte_idx] = wcu;
 416                     pstr->offsets[byte_idx] = src_idx;
 417                     for (i = 1; i < mbcdlen; ++i)
 418                       {
 419                         pstr->offsets[byte_idx + i]
 420                           = src_idx + (i < mbclen ? i : mbclen - 1);
 421                         pstr->wcs[byte_idx + i] = WEOF;
 422                       }
 423                     pstr->len += mbcdlen - mbclen;
 424                     if (pstr->raw_stop > src_idx)
 425                       pstr->stop += mbcdlen - mbclen;
 426                     end_idx = (pstr->bufs_len > pstr->len)
 427                               ? pstr->len : pstr->bufs_len;
 428                     byte_idx += mbcdlen;
 429                     src_idx += mbclen;
 430                     continue;
 431                   }
 432                 else
 433                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 434               }
 435             else
 436               memcpy (pstr->mbs + byte_idx, p, mbclen);
 437
 438             if (BE (pstr->offsets_needed != 0, 0))
 439               {
 440                 size_t i;
 441                 for (i = 0; i < mbclen; ++i)
 442                   pstr->offsets[byte_idx + i] = src_idx + i;
 443               }
 444             src_idx += mbclen;
 445
 446             pstr->wcs[byte_idx++] = wcu;
 447             /* Write paddings.  */
 448             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 449               pstr->wcs[byte_idx++] = WEOF;
 450           }
 451         else if (mbclen == (size_t) -1 || mbclen == 0)
 452           {
 453             /* It is an invalid character or '\0'.  Just use the byte.  */
 454             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 455
 456             if (BE (pstr->trans != NULL, 0))
 457               ch = pstr->trans [ch];
 458             pstr->mbs[byte_idx] = ch;
 459
 460             if (BE (pstr->offsets_needed != 0, 0))
 461               pstr->offsets[byte_idx] = src_idx;
 462             ++src_idx;
 463
 464             /* And also cast it to wide char.  */
 465             pstr->wcs[byte_idx++] = (wchar_t) ch;
 466             if (BE (mbclen == (size_t) -1, 0))
 467               pstr->cur_state = prev_st;
 468           }
 469         else
 470           {
 471             /* The buffer doesn't have enough space, finish to build.  */
 472             pstr->cur_state = prev_st;
 473             break;
 474           }
 475       }
 476   pstr->valid_len = byte_idx;
 477   pstr->valid_raw_len = src_idx;
 478   return REG_NOERROR;
 479 }
 480
 481 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 482    Return the index.  */
 483
 484 static Idx
 485 internal_function
 486 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 487 {
 488   mbstate_t prev_st;
 489   Idx rawbuf_idx;
 490   size_t mbclen;
 491   wint_t wc = WEOF;
 492
 493   /* Skip the characters which are not necessary to check.  */
 494   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 495        rawbuf_idx < new_raw_idx;)
 496     {
 497       wchar_t wc2;
 498       Idx remain_len;
 499       remain_len = pstr->len - rawbuf_idx;
 500       prev_st = pstr->cur_state;
 501       mbclen = mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 502                         remain_len, &pstr->cur_state);
 503       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 504         {
 505           /* We treat these cases as a single byte character.  */
 506           if (mbclen == 0 || remain_len == 0)
 507             wc = L'\0';
 508           else
 509             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 510           mbclen = 1;
 511           pstr->cur_state = prev_st;
 512         }
 513       else
 514         wc = wc2;
 515       /* Then proceed the next character.  */
 516       rawbuf_idx += mbclen;
 517     }
 518   *last_wc = wc;
 519   return rawbuf_idx;
 520 }
 521 #endif /* RE_ENABLE_I18N  */
 522
 523 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 524    This function is used in case of REG_ICASE.  */
 525
 526 static void
 527 internal_function
 528 build_upper_buffer (re_string_t *pstr)
 529 {
 530   Idx char_idx, end_idx;
 531   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 532
 533   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 534     {
 535       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 536       if (BE (pstr->trans != NULL, 0))
 537         ch = pstr->trans[ch];
 538       if (islower (ch))
 539         pstr->mbs[char_idx] = toupper (ch);
 540       else
 541         pstr->mbs[char_idx] = ch;
 542     }
 543   pstr->valid_len = char_idx;
 544   pstr->valid_raw_len = char_idx;
 545 }
 546
 547 /* Apply TRANS to the buffer in PSTR.  */
 548
 549 static void
 550 internal_function
 551 re_string_translate_buffer (re_string_t *pstr)
 552 {
 553   Idx buf_idx, end_idx;
 554   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 555
 556   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 557     {
 558       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 559       pstr->mbs[buf_idx] = pstr->trans[ch];
 560     }
 561
 562   pstr->valid_len = buf_idx;
 563   pstr->valid_raw_len = buf_idx;
 564 }
 565
 566 /* This function re-construct the buffers.
 567    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 568    convert to upper case in case of REG_ICASE, apply translation.  */
 569
 570 static reg_errcode_t
 571 internal_function
 572 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 573 {
 574   Idx offset;
 575
 576   if (BE (pstr->raw_mbs_idx <= idx, 0))
 577     offset = idx - pstr->raw_mbs_idx;
 578   else
 579     {
 580       /* Reset buffer.  */
 581 #ifdef RE_ENABLE_I18N
 582       if (pstr->mb_cur_max > 1)
 583         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 584 #endif /* RE_ENABLE_I18N */
 585       pstr->len = pstr->raw_len;
 586       pstr->stop = pstr->raw_stop;
 587       pstr->valid_len = 0;
 588       pstr->raw_mbs_idx = 0;
 589       pstr->valid_raw_len = 0;
 590       pstr->offsets_needed = 0;
 591       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 592                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 593       if (!pstr->mbs_allocated)
 594         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 595       offset = idx;
 596     }
 597
 598   if (BE (offset != 0, 1))
 599     {
 600       /* Are the characters which are already checked remain?  */
 601       if (BE (offset < pstr->valid_raw_len, 1)
 602 #ifdef RE_ENABLE_I18N
 603           /* Handling this would enlarge the code too much.
 604              Accept a slowdown in that case.  */
 605           && pstr->offsets_needed == 0
 606 #endif
 607          )
 608         {
 609           /* Yes, move them to the front of the buffer.  */
 610           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 611 #ifdef RE_ENABLE_I18N
 612           if (pstr->mb_cur_max > 1)
 613             memmove (pstr->wcs, pstr->wcs + offset,
 614                      (pstr->valid_len - offset) * sizeof (wint_t));
 615 #endif /* RE_ENABLE_I18N */
 616           if (BE (pstr->mbs_allocated, 0))
 617             memmove (pstr->mbs, pstr->mbs + offset,
 618                      pstr->valid_len - offset);
 619           pstr->valid_len -= offset;
 620           pstr->valid_raw_len -= offset;
 621 #if DEBUG
 622           assert (pstr->valid_len > 0);
 623 #endif
 624         }
 625       else
 626         {
 627           /* No, skip all characters until IDX.  */
 628 #ifdef RE_ENABLE_I18N
 629           if (BE (pstr->offsets_needed, 0))
 630             {
 631               pstr->len = pstr->raw_len - idx + offset;
 632               pstr->stop = pstr->raw_stop - idx + offset;
 633               pstr->offsets_needed = 0;
 634             }
 635 #endif
 636           pstr->valid_len = 0;
 637 #ifdef RE_ENABLE_I18N
 638           if (pstr->mb_cur_max > 1)
 639             {
 640               Idx wcs_idx;
 641               wint_t wc = WEOF;
 642
 643               if (pstr->is_utf8)
 644                 {
 645                   const unsigned char *raw, *p, *q, *end;
 646
 647                   /* Special case UTF-8.  Multi-byte chars start with any
 648                      byte other than 0x80 - 0xbf.  */
 649                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 650                   end = raw + (offset - pstr->mb_cur_max);
 651                   p = raw + offset - 1;
 652 #ifdef _LIBC
 653                   /* We know the wchar_t encoding is UCS4, so for the simple
 654                      case, ASCII characters, skip the conversion step.  */
 655                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 656                     {
 657                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 658                       pstr->valid_len = 0;
 659                       wc = (wchar_t) *p;
 660                     }
 661                   else
 662 #endif
 663                     for (; p >= end; --p)
 664                       if ((*p & 0xc0) != 0x80)
 665                         {
 666                           mbstate_t cur_state;
 667                           wchar_t wc2;
 668                           Idx mlen = raw + pstr->len - p;
 669                           unsigned char buf[6];
 670                           size_t mbclen;
 671
 672                           q = p;
 673                           if (BE (pstr->trans != NULL, 0))
 674                             {
 675                               int i = mlen < 6 ? mlen : 6;
 676                               while (--i >= 0)
 677                                 buf[i] = pstr->trans[p[i]];
 678                               q = buf;
 679                             }
 680                           /* XXX Don't use mbrtowc, we know which conversion
 681                              to use (UTF-8 -> UCS4).  */
 682                           memset (&cur_state, 0, sizeof (cur_state));
 683                           mbclen = mbrtowc (&wc2, (const char *) p, mlen,
 684                                             &cur_state);
 685                           if (raw + offset - p <= mbclen
 686                               && mbclen < (size_t) -2)
 687                             {
 688                               memset (&pstr->cur_state, '\0',
 689                                       sizeof (mbstate_t));
 690                               pstr->valid_len = mbclen - (raw + offset - p);
 691                               wc = wc2;
 692                             }
 693                           break;
 694                         }
 695                 }
 696
 697               if (wc == WEOF)
 698                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 699               if (wc == WEOF)
 700                 pstr->tip_context
 701                   = re_string_context_at (pstr, pstr->valid_raw_len - 1, eflags);
 702               else
 703                 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 704                                       && IS_WIDE_WORD_CHAR (wc))
 705                                      ? CONTEXT_WORD
 706                                      : ((IS_WIDE_NEWLINE (wc)
 707                                          && pstr->newline_anchor)
 708                                         ? CONTEXT_NEWLINE : 0));
 709               if (BE (pstr->valid_len, 0))
 710                 {
 711                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 712                     pstr->wcs[wcs_idx] = WEOF;
 713                   if (pstr->mbs_allocated)
 714                     memset (pstr->mbs, -1, pstr->valid_len);
 715                 }
 716               pstr->valid_raw_len = pstr->valid_len;
 717             }
 718           else
 719 #endif /* RE_ENABLE_I18N */
 720             {
 721               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 722               pstr->valid_raw_len = 0;
 723               if (pstr->trans)
 724                 c = pstr->trans[c];
 725               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 726                                    ? CONTEXT_WORD
 727                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 728                                       ? CONTEXT_NEWLINE : 0));
 729             }
 730         }
 731       if (!BE (pstr->mbs_allocated, 0))
 732         pstr->mbs += offset;
 733     }
 734   pstr->raw_mbs_idx = idx;
 735   pstr->len -= offset;
 736   pstr->stop -= offset;
 737
 738   /* Then build the buffers.  */
 739 #ifdef RE_ENABLE_I18N
 740   if (pstr->mb_cur_max > 1)
 741     {
 742       if (pstr->icase)
 743         {
 744           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 745           if (BE (ret != REG_NOERROR, 0))
 746             return ret;
 747         }
 748       else
 749         build_wcs_buffer (pstr);
 750     }
 751   else
 752 #endif /* RE_ENABLE_I18N */
 753     if (BE (pstr->mbs_allocated, 0))
 754       {
 755         if (pstr->icase)
 756           build_upper_buffer (pstr);
 757         else if (pstr->trans != NULL)
 758           re_string_translate_buffer (pstr);
 759       }
 760     else
 761       pstr->valid_len = pstr->len;
 762
 763   pstr->cur_idx = 0;
 764   return REG_NOERROR;
 765 }
 766
 767 static unsigned char
 768 internal_function __attribute ((pure))
 769 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 770 {
 771   int ch;
 772   Idx off;
 773
 774   /* Handle the common (easiest) cases first.  */
 775   if (BE (!pstr->mbs_allocated, 1))
 776     return re_string_peek_byte (pstr, idx);
 777
 778 #ifdef RE_ENABLE_I18N
 779   if (pstr->mb_cur_max > 1
 780       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 781     return re_string_peek_byte (pstr, idx);
 782 #endif
 783
 784   off = pstr->cur_idx + idx;
 785 #ifdef RE_ENABLE_I18N
 786   if (pstr->offsets_needed)
 787     off = pstr->offsets[off];
 788 #endif
 789
 790   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 791
 792 #ifdef RE_ENABLE_I18N
 793   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 794      this function returns CAPITAL LETTER I instead of first byte of
 795      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 796      since peek_byte_case doesn't advance cur_idx in any way.  */
 797   if (pstr->offsets_needed && !isascii (ch))
 798     return re_string_peek_byte (pstr, idx);
 799 #endif
 800
 801   return ch;
 802 }
 803
 804 static unsigned char
 805 internal_function __attribute ((pure))
 806 re_string_fetch_byte_case (re_string_t *pstr)
 807 {
 808   if (BE (!pstr->mbs_allocated, 1))
 809     return re_string_fetch_byte (pstr);
 810
 811 #ifdef RE_ENABLE_I18N
 812   if (pstr->offsets_needed)
 813     {
 814       Idx off;
 815       int ch;
 816
 817       /* For tr_TR.UTF-8 [[:islower:]] there is
 818          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 819          in that case the whole multi-byte character and return
 820          the original letter.  On the other side, with
 821          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 822          anything else would complicate things too much.  */
 823
 824       if (!re_string_first_byte (pstr, pstr->cur_idx))
 825         return re_string_fetch_byte (pstr);
 826
 827       off = pstr->offsets[pstr->cur_idx];
 828       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 829
 830       if (! isascii (ch))
 831         return re_string_fetch_byte (pstr);
 832
 833       re_string_skip_bytes (pstr,
 834                             re_string_char_size_at (pstr, pstr->cur_idx));
 835       return ch;
 836     }
 837 #endif
 838
 839   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 840 }
 841
 842 static void
 843 internal_function
 844 re_string_destruct (re_string_t *pstr)
 845 {
 846 #ifdef RE_ENABLE_I18N
 847   re_free (pstr->wcs);
 848   re_free (pstr->offsets);
 849 #endif /* RE_ENABLE_I18N  */
 850   if (pstr->mbs_allocated)
 851     re_free (pstr->mbs);
 852 }
 853
 854 /* Return the context at IDX in INPUT.  */
 855
 856 static unsigned int
 857 internal_function
 858 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 859 {
 860   int c;
 861   if (BE (! REG_VALID_INDEX (idx), 0))
 862     /* In this case, we use the value stored in input->tip_context,
 863        since we can't know the character in input->mbs[-1] here.  */
 864     return input->tip_context;
 865   if (BE (idx == input->len, 0))
 866     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 867             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 868 #ifdef RE_ENABLE_I18N
 869   if (input->mb_cur_max > 1)
 870     {
 871       wint_t wc;
 872       Idx wc_idx = idx;
 873       while(input->wcs[wc_idx] == WEOF)
 874         {
 875 #ifdef DEBUG
 876           /* It must not happen.  */
 877           assert (REG_VALID_INDEX (wc_idx));
 878 #endif
 879           --wc_idx;
 880           if (! REG_VALID_INDEX (wc_idx))
 881             return input->tip_context;
 882         }
 883       wc = input->wcs[wc_idx];
 884       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 885         return CONTEXT_WORD;
 886       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 887               ? CONTEXT_NEWLINE : 0);
 888     }
 889   else
 890 #endif
 891     {
 892       c = re_string_byte_at (input, idx);
 893       if (bitset_contain (input->word_char, c))
 894         return CONTEXT_WORD;
 895       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 896     }
 897 }
 898 \f
 899 /* Functions for set operation.  */
 900
 901 static reg_errcode_t
 902 internal_function
 903 re_node_set_alloc (re_node_set *set, Idx size)
 904 {
 905   set->alloc = size;
 906   set->nelem = 0;
 907   set->elems = re_malloc (Idx, size);
 908   if (BE (set->elems == NULL, 0))
 909     return REG_ESPACE;
 910   return REG_NOERROR;
 911 }
 912
 913 static reg_errcode_t
 914 internal_function
 915 re_node_set_init_1 (re_node_set *set, Idx elem)
 916 {
 917   set->alloc = 1;
 918   set->nelem = 1;
 919   set->elems = re_malloc (Idx, 1);
 920   if (BE (set->elems == NULL, 0))
 921     {
 922       set->alloc = set->nelem = 0;
 923       return REG_ESPACE;
 924     }
 925   set->elems[0] = elem;
 926   return REG_NOERROR;
 927 }
 928
 929 static reg_errcode_t
 930 internal_function
 931 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
 932 {
 933   set->alloc = 2;
 934   set->elems = re_malloc (Idx, 2);
 935   if (BE (set->elems == NULL, 0))
 936     return REG_ESPACE;
 937   if (elem1 == elem2)
 938     {
 939       set->nelem = 1;
 940       set->elems[0] = elem1;
 941     }
 942   else
 943     {
 944       set->nelem = 2;
 945       if (elem1 < elem2)
 946         {
 947           set->elems[0] = elem1;
 948           set->elems[1] = elem2;
 949         }
 950       else
 951         {
 952           set->elems[0] = elem2;
 953           set->elems[1] = elem1;
 954         }
 955     }
 956   return REG_NOERROR;
 957 }
 958
 959 static reg_errcode_t
 960 internal_function
 961 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
 962 {
 963   dest->nelem = src->nelem;
 964   if (src->nelem > 0)
 965     {
 966       dest->alloc = dest->nelem;
 967       dest->elems = re_malloc (Idx, dest->alloc);
 968       if (BE (dest->elems == NULL, 0))
 969         {
 970           dest->alloc = dest->nelem = 0;
 971           return REG_ESPACE;
 972         }
 973       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
 974     }
 975   else
 976     re_node_set_init_empty (dest);
 977   return REG_NOERROR;
 978 }
 979
 980 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
 981    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
 982    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
 983
 984 static reg_errcode_t
 985 internal_function
 986 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
 987                            const re_node_set *src2)
 988 {
 989   Idx i1, i2, is, id, delta, sbase;
 990   if (src1->nelem == 0 || src2->nelem == 0)
 991     return REG_NOERROR;
 992
 993   /* We need dest->nelem + 2 * elems_in_intersection; this is a
 994      conservative estimate.  */
 995   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
 996     {
 997       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
 998       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
 999       if (BE (new_elems == NULL, 0))
1000         return REG_ESPACE;
1001       dest->elems = new_elems;
1002       dest->alloc = new_alloc;
1003     }
1004
1005   /* Find the items in the intersection of SRC1 and SRC2, and copy
1006      into the top of DEST those that are not already in DEST itself.  */
1007   sbase = dest->nelem + src1->nelem + src2->nelem;
1008   i1 = src1->nelem - 1;
1009   i2 = src2->nelem - 1;
1010   id = dest->nelem - 1;
1011   for (;;)
1012     {
1013       if (src1->elems[i1] == src2->elems[i2])
1014         {
1015           /* Try to find the item in DEST.  Maybe we could binary search?  */
1016           while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1017             --id;
1018
1019           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1020             dest->elems[--sbase] = src1->elems[i1];
1021
1022           if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1023             break;
1024         }
1025
1026       /* Lower the highest of the two items.  */
1027       else if (src1->elems[i1] < src2->elems[i2])
1028         {
1029           if (! REG_VALID_INDEX (--i2))
1030             break;
1031         }
1032       else
1033         {
1034           if (! REG_VALID_INDEX (--i1))
1035             break;
1036         }
1037     }
1038
1039   id = dest->nelem - 1;
1040   is = dest->nelem + src1->nelem + src2->nelem - 1;
1041   delta = is - sbase + 1;
1042
1043   /* Now copy.  When DELTA becomes zero, the remaining
1044      DEST elements are already in place; this is more or
1045      less the same loop that is in re_node_set_merge.  */
1046   dest->nelem += delta;
1047   if (delta > 0 && REG_VALID_INDEX (id))
1048     for (;;)
1049       {
1050         if (dest->elems[is] > dest->elems[id])
1051           {
1052             /* Copy from the top.  */
1053             dest->elems[id + delta--] = dest->elems[is--];
1054             if (delta == 0)
1055               break;
1056           }
1057         else
1058           {
1059             /* Slide from the bottom.  */
1060             dest->elems[id + delta] = dest->elems[id];
1061             if (! REG_VALID_INDEX (--id))
1062               break;
1063           }
1064       }
1065
1066   /* Copy remaining SRC elements.  */
1067   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1068
1069   return REG_NOERROR;
1070 }
1071
1072 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1073    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1074
1075 static reg_errcode_t
1076 internal_function
1077 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1078                         const re_node_set *src2)
1079 {
1080   Idx i1, i2, id;
1081   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1082     {
1083       dest->alloc = src1->nelem + src2->nelem;
1084       dest->elems = re_malloc (Idx, dest->alloc);
1085       if (BE (dest->elems == NULL, 0))
1086         return REG_ESPACE;
1087     }
1088   else
1089     {
1090       if (src1 != NULL && src1->nelem > 0)
1091         return re_node_set_init_copy (dest, src1);
1092       else if (src2 != NULL && src2->nelem > 0)
1093         return re_node_set_init_copy (dest, src2);
1094       else
1095         re_node_set_init_empty (dest);
1096       return REG_NOERROR;
1097     }
1098   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1099     {
1100       if (src1->elems[i1] > src2->elems[i2])
1101         {
1102           dest->elems[id++] = src2->elems[i2++];
1103           continue;
1104         }
1105       if (src1->elems[i1] == src2->elems[i2])
1106         ++i2;
1107       dest->elems[id++] = src1->elems[i1++];
1108     }
1109   if (i1 < src1->nelem)
1110     {
1111       memcpy (dest->elems + id, src1->elems + i1,
1112              (src1->nelem - i1) * sizeof (Idx));
1113       id += src1->nelem - i1;
1114     }
1115   else if (i2 < src2->nelem)
1116     {
1117       memcpy (dest->elems + id, src2->elems + i2,
1118              (src2->nelem - i2) * sizeof (Idx));
1119       id += src2->nelem - i2;
1120     }
1121   dest->nelem = id;
1122   return REG_NOERROR;
1123 }
1124
1125 /* Calculate the union set of the sets DEST and SRC. And store it to
1126    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1127
1128 static reg_errcode_t
1129 internal_function
1130 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1131 {
1132   Idx is, id, sbase, delta;
1133   if (src == NULL || src->nelem == 0)
1134     return REG_NOERROR;
1135   if (dest->alloc < 2 * src->nelem + dest->nelem)
1136     {
1137       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1138       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1139       if (BE (new_buffer == NULL, 0))
1140         return REG_ESPACE;
1141       dest->elems = new_buffer;
1142       dest->alloc = new_alloc;
1143     }
1144
1145   if (BE (dest->nelem == 0, 0))
1146     {
1147       dest->nelem = src->nelem;
1148       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1149       return REG_NOERROR;
1150     }
1151
1152   /* Copy into the top of DEST the items of SRC that are not
1153      found in DEST.  Maybe we could binary search in DEST?  */
1154   for (sbase = dest->nelem + 2 * src->nelem,
1155        is = src->nelem - 1, id = dest->nelem - 1;
1156        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1157     {
1158       if (dest->elems[id] == src->elems[is])
1159         is--, id--;
1160       else if (dest->elems[id] < src->elems[is])
1161         dest->elems[--sbase] = src->elems[is--];
1162       else /* if (dest->elems[id] > src->elems[is]) */
1163         --id;
1164     }
1165
1166   if (REG_VALID_INDEX (is))
1167     {
1168       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1169       sbase -= is + 1;
1170       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1171     }
1172
1173   id = dest->nelem - 1;
1174   is = dest->nelem + 2 * src->nelem - 1;
1175   delta = is - sbase + 1;
1176   if (delta == 0)
1177     return REG_NOERROR;
1178
1179   /* Now copy.  When DELTA becomes zero, the remaining
1180      DEST elements are already in place.  */
1181   dest->nelem += delta;
1182   for (;;)
1183     {
1184       if (dest->elems[is] > dest->elems[id])
1185         {
1186           /* Copy from the top.  */
1187           dest->elems[id + delta--] = dest->elems[is--];
1188           if (delta == 0)
1189             break;
1190         }
1191       else
1192         {
1193           /* Slide from the bottom.  */
1194           dest->elems[id + delta] = dest->elems[id];
1195           if (! REG_VALID_INDEX (--id))
1196             {
1197               /* Copy remaining SRC elements.  */
1198               memcpy (dest->elems, dest->elems + sbase,
1199                       delta * sizeof (Idx));
1200               break;
1201             }
1202         }
1203     }
1204
1205   return REG_NOERROR;
1206 }
1207
1208 /* Insert the new element ELEM to the re_node_set* SET.
1209    SET should not already have ELEM.
1210    Return true if successful.  */
1211
1212 static bool
1213 internal_function
1214 re_node_set_insert (re_node_set *set, Idx elem)
1215 {
1216   Idx idx;
1217   /* In case the set is empty.  */
1218   if (set->alloc == 0)
1219     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1220
1221   if (BE (set->nelem, 0) == 0)
1222     {
1223       /* We already guaranteed above that set->alloc != 0.  */
1224       set->elems[0] = elem;
1225       ++set->nelem;
1226       return true;
1227     }
1228
1229   /* Realloc if we need.  */
1230   if (set->alloc == set->nelem)
1231     {
1232       Idx *new_elems;
1233       set->alloc = set->alloc * 2;
1234       new_elems = re_realloc (set->elems, Idx, set->alloc);
1235       if (BE (new_elems == NULL, 0))
1236         return false;
1237       set->elems = new_elems;
1238     }
1239
1240   /* Move the elements which follows the new element.  Test the
1241      first element separately to skip a check in the inner loop.  */
1242   if (elem < set->elems[0])
1243     {
1244       idx = 0;
1245       for (idx = set->nelem; idx > 0; idx--)
1246         set->elems[idx] = set->elems[idx - 1];
1247     }
1248   else
1249     {
1250       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1251         set->elems[idx] = set->elems[idx - 1];
1252     }
1253
1254   /* Insert the new element.  */
1255   set->elems[idx] = elem;
1256   ++set->nelem;
1257   return true;
1258 }
1259
1260 /* Insert the new element ELEM to the re_node_set* SET.
1261    SET should not already have any element greater than or equal to ELEM.
1262    Return true if successful.  */
1263
1264 static bool
1265 internal_function
1266 re_node_set_insert_last (re_node_set *set, Idx elem)
1267 {
1268   /* Realloc if we need.  */
1269   if (set->alloc == set->nelem)
1270     {
1271       Idx *new_elems;
1272       set->alloc = (set->alloc + 1) * 2;
1273       new_elems = re_realloc (set->elems, Idx, set->alloc);
1274       if (BE (new_elems == NULL, 0))
1275         return false;
1276       set->elems = new_elems;
1277     }
1278
1279   /* Insert the new element.  */
1280   set->elems[set->nelem++] = elem;
1281   return true;
1282 }
1283
1284 /* Compare two node sets SET1 and SET2.
1285    Return true if SET1 and SET2 are equivalent.  */
1286
1287 static bool
1288 internal_function __attribute ((pure))
1289 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1290 {
1291   Idx i;
1292   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1293     return false;
1294   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1295     if (set1->elems[i] != set2->elems[i])
1296       return false;
1297   return true;
1298 }
1299
1300 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1301
1302 static Idx
1303 internal_function __attribute ((pure))
1304 re_node_set_contains (const re_node_set *set, Idx elem)
1305 {
1306   __re_size_t idx, right, mid;
1307   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1308     return 0;
1309
1310   /* Binary search the element.  */
1311   idx = 0;
1312   right = set->nelem - 1;
1313   while (idx < right)
1314     {
1315       mid = (idx + right) / 2;
1316       if (set->elems[mid] < elem)
1317         idx = mid + 1;
1318       else
1319         right = mid;
1320     }
1321   return set->elems[idx] == elem ? idx + 1 : 0;
1322 }
1323
1324 static void
1325 internal_function
1326 re_node_set_remove_at (re_node_set *set, Idx idx)
1327 {
1328   if (idx < 0 || idx >= set->nelem)
1329     return;
1330   --set->nelem;
1331   for (; idx < set->nelem; idx++)
1332     set->elems[idx] = set->elems[idx + 1];
1333 }
1334 \f
1335
1336 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1337    Or return REG_MISSING if an error occurred.  */
1338
1339 static Idx
1340 internal_function
1341 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1342 {
1343   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1344     {
1345       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1346       Idx *new_nexts, *new_indices;
1347       re_node_set *new_edests, *new_eclosures;
1348       re_token_t *new_nodes;
1349       size_t max_object_size =
1350         MAX (sizeof (re_token_t),
1351              MAX (sizeof (re_node_set),
1352                   sizeof (Idx)));
1353
1354       /* Avoid overflows.  */
1355       if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1356         return REG_MISSING;
1357
1358       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1359       if (BE (new_nodes == NULL, 0))
1360         return REG_MISSING;
1361       dfa->nodes = new_nodes;
1362       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1363       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1364       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1365       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1366       if (BE (new_nexts == NULL || new_indices == NULL
1367               || new_edests == NULL || new_eclosures == NULL, 0))
1368         return REG_MISSING;
1369       dfa->nexts = new_nexts;
1370       dfa->org_indices = new_indices;
1371       dfa->edests = new_edests;
1372       dfa->eclosures = new_eclosures;
1373       dfa->nodes_alloc = new_nodes_alloc;
1374     }
1375   dfa->nodes[dfa->nodes_len] = token;
1376   dfa->nodes[dfa->nodes_len].constraint = 0;
1377 #ifdef RE_ENABLE_I18N
1378   {
1379   int type = token.type;
1380   dfa->nodes[dfa->nodes_len].accept_mb =
1381     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1382   }
1383 #endif
1384   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1385   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1386   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1387   return dfa->nodes_len++;
1388 }
1389
1390 static inline re_hashval_t
1391 internal_function
1392 calc_state_hash (const re_node_set *nodes, unsigned int context)
1393 {
1394   re_hashval_t hash = nodes->nelem + context;
1395   Idx i;
1396   for (i = 0 ; i < nodes->nelem ; i++)
1397     hash += nodes->elems[i];
1398   return hash;
1399 }
1400
1401 /* Search for the state whose node_set is equivalent to NODES.
1402    Return the pointer to the state, if we found it in the DFA.
1403    Otherwise create the new one and return it.  In case of an error
1404    return NULL and set the error code in ERR.
1405    Note: - We assume NULL as the invalid state, then it is possible that
1406            return value is NULL and ERR is REG_NOERROR.
1407          - We never return non-NULL value in case of any errors, it is for
1408            optimization.  */
1409
1410 static re_dfastate_t *
1411 internal_function
1412 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1413                   const re_node_set *nodes)
1414 {
1415   re_hashval_t hash;
1416   re_dfastate_t *new_state;
1417   struct re_state_table_entry *spot;
1418   Idx i;
1419 #ifdef lint
1420   /* Suppress bogus uninitialized-variable warnings.  */
1421   *err = REG_NOERROR;
1422 #endif
1423   if (BE (nodes->nelem == 0, 0))
1424     {
1425       *err = REG_NOERROR;
1426       return NULL;
1427     }
1428   hash = calc_state_hash (nodes, 0);
1429   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1430
1431   for (i = 0 ; i < spot->num ; i++)
1432     {
1433       re_dfastate_t *state = spot->array[i];
1434       if (hash != state->hash)
1435         continue;
1436       if (re_node_set_compare (&state->nodes, nodes))
1437         return state;
1438     }
1439
1440   /* There are no appropriate state in the dfa, create the new one.  */
1441   new_state = create_ci_newstate (dfa, nodes, hash);
1442   if (BE (new_state == NULL, 0))
1443     *err = REG_ESPACE;
1444
1445   return new_state;
1446 }
1447
1448 /* Search for the state whose node_set is equivalent to NODES and
1449    whose context is equivalent to CONTEXT.
1450    Return the pointer to the state, if we found it in the DFA.
1451    Otherwise create the new one and return it.  In case of an error
1452    return NULL and set the error code in ERR.
1453    Note: - We assume NULL as the invalid state, then it is possible that
1454            return value is NULL and ERR is REG_NOERROR.
1455          - We never return non-NULL value in case of any errors, it is for
1456            optimization.  */
1457
1458 static re_dfastate_t *
1459 internal_function
1460 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1461                           const re_node_set *nodes, unsigned int context)
1462 {
1463   re_hashval_t hash;
1464   re_dfastate_t *new_state;
1465   struct re_state_table_entry *spot;
1466   Idx i;
1467 #ifdef lint
1468   /* Suppress bogus uninitialized-variable warnings.  */
1469   *err = REG_NOERROR;
1470 #endif
1471   if (nodes->nelem == 0)
1472     {
1473       *err = REG_NOERROR;
1474       return NULL;
1475     }
1476   hash = calc_state_hash (nodes, context);
1477   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1478
1479   for (i = 0 ; i < spot->num ; i++)
1480     {
1481       re_dfastate_t *state = spot->array[i];
1482       if (state->hash == hash
1483           && state->context == context
1484           && re_node_set_compare (state->entrance_nodes, nodes))
1485         return state;
1486     }
1487   /* There are no appropriate state in `dfa', create the new one.  */
1488   new_state = create_cd_newstate (dfa, nodes, context, hash);
1489   if (BE (new_state == NULL, 0))
1490     *err = REG_ESPACE;
1491
1492   return new_state;
1493 }
1494
1495 /* Finish initialization of the new state NEWSTATE, and using its hash value
1496    HASH put in the appropriate bucket of DFA's state table.  Return value
1497    indicates the error code if failed.  */
1498
1499 static reg_errcode_t
1500 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1501                 re_hashval_t hash)
1502 {
1503   struct re_state_table_entry *spot;
1504   reg_errcode_t err;
1505   Idx i;
1506
1507   newstate->hash = hash;
1508   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1509   if (BE (err != REG_NOERROR, 0))
1510     return REG_ESPACE;
1511   for (i = 0; i < newstate->nodes.nelem; i++)
1512     {
1513       Idx elem = newstate->nodes.elems[i];
1514       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1515         if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1516           return REG_ESPACE;
1517     }
1518
1519   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1520   if (BE (spot->alloc <= spot->num, 0))
1521     {
1522       Idx new_alloc = 2 * spot->num + 2;
1523       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1524                                               new_alloc);
1525       if (BE (new_array == NULL, 0))
1526         return REG_ESPACE;
1527       spot->array = new_array;
1528       spot->alloc = new_alloc;
1529     }
1530   spot->array[spot->num++] = newstate;
1531   return REG_NOERROR;
1532 }
1533
1534 static void
1535 free_state (re_dfastate_t *state)
1536 {
1537   re_node_set_free (&state->non_eps_nodes);
1538   re_node_set_free (&state->inveclosure);
1539   if (state->entrance_nodes != &state->nodes)
1540     {
1541       re_node_set_free (state->entrance_nodes);
1542       re_free (state->entrance_nodes);
1543     }
1544   re_node_set_free (&state->nodes);
1545   re_free (state->word_trtable);
1546   re_free (state->trtable);
1547   re_free (state);
1548 }
1549
1550 /* Create the new state which is independ of contexts.
1551    Return the new state if succeeded, otherwise return NULL.  */
1552
1553 static re_dfastate_t *
1554 internal_function
1555 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1556                     re_hashval_t hash)
1557 {
1558   Idx i;
1559   reg_errcode_t err;
1560   re_dfastate_t *newstate;
1561
1562   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1563   if (BE (newstate == NULL, 0))
1564     return NULL;
1565   err = re_node_set_init_copy (&newstate->nodes, nodes);
1566   if (BE (err != REG_NOERROR, 0))
1567     {
1568       re_free (newstate);
1569       return NULL;
1570     }
1571
1572   newstate->entrance_nodes = &newstate->nodes;
1573   for (i = 0 ; i < nodes->nelem ; i++)
1574     {
1575       re_token_t *node = dfa->nodes + nodes->elems[i];
1576       re_token_type_t type = node->type;
1577       if (type == CHARACTER && !node->constraint)
1578         continue;
1579 #ifdef RE_ENABLE_I18N
1580       newstate->accept_mb |= node->accept_mb;
1581 #endif /* RE_ENABLE_I18N */
1582
1583       /* If the state has the halt node, the state is a halt state.  */
1584       if (type == END_OF_RE)
1585         newstate->halt = 1;
1586       else if (type == OP_BACK_REF)
1587         newstate->has_backref = 1;
1588       else if (type == ANCHOR || node->constraint)
1589         newstate->has_constraint = 1;
1590     }
1591   err = register_state (dfa, newstate, hash);
1592   if (BE (err != REG_NOERROR, 0))
1593     {
1594       free_state (newstate);
1595       newstate = NULL;
1596     }
1597   return newstate;
1598 }
1599
1600 /* Create the new state which is depend on the context CONTEXT.
1601    Return the new state if succeeded, otherwise return NULL.  */
1602
1603 static re_dfastate_t *
1604 internal_function
1605 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1606                     unsigned int context, re_hashval_t hash)
1607 {
1608   Idx i, nctx_nodes = 0;
1609   reg_errcode_t err;
1610   re_dfastate_t *newstate;
1611
1612   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1613   if (BE (newstate == NULL, 0))
1614     return NULL;
1615   err = re_node_set_init_copy (&newstate->nodes, nodes);
1616   if (BE (err != REG_NOERROR, 0))
1617     {
1618       re_free (newstate);
1619       return NULL;
1620     }
1621
1622   newstate->context = context;
1623   newstate->entrance_nodes = &newstate->nodes;
1624
1625   for (i = 0 ; i < nodes->nelem ; i++)
1626     {
1627       unsigned int constraint = 0;
1628       re_token_t *node = dfa->nodes + nodes->elems[i];
1629       re_token_type_t type = node->type;
1630       if (node->constraint)
1631         constraint = node->constraint;
1632
1633       if (type == CHARACTER && !constraint)
1634         continue;
1635 #ifdef RE_ENABLE_I18N
1636       newstate->accept_mb |= node->accept_mb;
1637 #endif /* RE_ENABLE_I18N */
1638
1639       /* If the state has the halt node, the state is a halt state.  */
1640       if (type == END_OF_RE)
1641         newstate->halt = 1;
1642       else if (type == OP_BACK_REF)
1643         newstate->has_backref = 1;
1644       else if (type == ANCHOR)
1645         constraint = node->opr.ctx_type;
1646
1647       if (constraint)
1648         {
1649           if (newstate->entrance_nodes == &newstate->nodes)
1650             {
1651               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1652               if (BE (newstate->entrance_nodes == NULL, 0))
1653                 {
1654                   free_state (newstate);
1655                   return NULL;
1656                 }
1657               re_node_set_init_copy (newstate->entrance_nodes, nodes);
1658               nctx_nodes = 0;
1659               newstate->has_constraint = 1;
1660             }
1661
1662           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1663             {
1664               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1665               ++nctx_nodes;
1666             }
1667         }
1668     }
1669   err = register_state (dfa, newstate, hash);
1670   if (BE (err != REG_NOERROR, 0))
1671     {
1672       free_state (newstate);
1673       newstate = NULL;
1674     }
1675   return  newstate;
1676 }