lib/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static void re_string_construct_common (const char *str, Idx len,
  21                                         re_string_t *pstr,
  22                                         RE_TRANSLATE_TYPE trans, bool icase,
  23                                         const re_dfa_t *dfa) internal_function;
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           re_hashval_t hash) internal_function;
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           re_hashval_t hash) internal_function;
  31 \f
  32 /* Functions for string operation.  */
  33
  34 /* This function allocate the buffers.  It is necessary to call
  35    re_string_reconstruct before using the object.  */
  36
  37 static reg_errcode_t
  38 internal_function
  39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  40                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  41 {
  42   reg_errcode_t ret;
  43   Idx init_buf_len;
  44
  45   /* Ensure at least one character fits into the buffers.  */
  46   if (init_len < dfa->mb_cur_max)
  47     init_len = dfa->mb_cur_max;
  48   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  49   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  50
  51   ret = re_string_realloc_buffers (pstr, init_buf_len);
  52   if (BE (ret != REG_NOERROR, 0))
  53     return ret;
  54
  55   pstr->word_char = dfa->word_char;
  56   pstr->word_ops_used = dfa->word_ops_used;
  57   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  58   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  59   pstr->valid_raw_len = pstr->valid_len;
  60   return REG_NOERROR;
  61 }
  62
  63 /* This function allocate the buffers, and initialize them.  */
  64
  65 static reg_errcode_t
  66 internal_function
  67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  68                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  69 {
  70   reg_errcode_t ret;
  71   memset (pstr, '\0', sizeof (re_string_t));
  72   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  73
  74   if (len > 0)
  75     {
  76       ret = re_string_realloc_buffers (pstr, len + 1);
  77       if (BE (ret != REG_NOERROR, 0))
  78         return ret;
  79     }
  80   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  81
  82   if (icase)
  83     {
  84 #ifdef RE_ENABLE_I18N
  85       if (dfa->mb_cur_max > 1)
  86         {
  87           while (1)
  88             {
  89               ret = build_wcs_upper_buffer (pstr);
  90               if (BE (ret != REG_NOERROR, 0))
  91                 return ret;
  92               if (pstr->valid_raw_len >= len)
  93                 break;
  94               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  95                 break;
  96               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  97               if (BE (ret != REG_NOERROR, 0))
  98                 return ret;
  99             }
 100         }
 101       else
 102 #endif /* RE_ENABLE_I18N  */
 103         build_upper_buffer (pstr);
 104     }
 105   else
 106     {
 107 #ifdef RE_ENABLE_I18N
 108       if (dfa->mb_cur_max > 1)
 109         build_wcs_buffer (pstr);
 110       else
 111 #endif /* RE_ENABLE_I18N  */
 112         {
 113           if (trans != NULL)
 114             re_string_translate_buffer (pstr);
 115           else
 116             {
 117               pstr->valid_len = pstr->bufs_len;
 118               pstr->valid_raw_len = pstr->bufs_len;
 119             }
 120         }
 121     }
 122
 123   return REG_NOERROR;
 124 }
 125
 126 /* Helper functions for re_string_allocate, and re_string_construct.  */
 127
 128 static reg_errcode_t
 129 internal_function
 130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 131 {
 132 #ifdef RE_ENABLE_I18N
 133   if (pstr->mb_cur_max > 1)
 134     {
 135       wint_t *new_wcs;
 136
 137       /* Avoid overflow.  */
 138       size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
 139       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
 140         return REG_ESPACE;
 141
 142       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 143       if (BE (new_wcs == NULL, 0))
 144         return REG_ESPACE;
 145       pstr->wcs = new_wcs;
 146       if (pstr->offsets != NULL)
 147         {
 148           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 149           if (BE (new_offsets == NULL, 0))
 150             return REG_ESPACE;
 151           pstr->offsets = new_offsets;
 152         }
 153     }
 154 #endif /* RE_ENABLE_I18N  */
 155   if (pstr->mbs_allocated)
 156     {
 157       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 158                                            new_buf_len);
 159       if (BE (new_mbs == NULL, 0))
 160         return REG_ESPACE;
 161       pstr->mbs = new_mbs;
 162     }
 163   pstr->bufs_len = new_buf_len;
 164   return REG_NOERROR;
 165 }
 166
 167
 168 static void
 169 internal_function
 170 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 171                             RE_TRANSLATE_TYPE trans, bool icase,
 172                             const re_dfa_t *dfa)
 173 {
 174   pstr->raw_mbs = (const unsigned char *) str;
 175   pstr->len = len;
 176   pstr->raw_len = len;
 177   pstr->trans = trans;
 178   pstr->icase = icase;
 179   pstr->mbs_allocated = (trans != NULL || icase);
 180   pstr->mb_cur_max = dfa->mb_cur_max;
 181   pstr->is_utf8 = dfa->is_utf8;
 182   pstr->map_notascii = dfa->map_notascii;
 183   pstr->stop = pstr->len;
 184   pstr->raw_stop = pstr->stop;
 185 }
 186
 187 #ifdef RE_ENABLE_I18N
 188
 189 /* Build wide character buffer PSTR->WCS.
 190    If the byte sequence of the string are:
 191      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 192    Then wide character buffer will be:
 193      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 194    We use WEOF for padding, they indicate that the position isn't
 195    a first byte of a multibyte character.
 196
 197    Note that this function assumes PSTR->VALID_LEN elements are already
 198    built and starts from PSTR->VALID_LEN.  */
 199
 200 static void
 201 internal_function
 202 build_wcs_buffer (re_string_t *pstr)
 203 {
 204 #ifdef _LIBC
 205   unsigned char buf[MB_LEN_MAX];
 206   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 207 #else
 208   unsigned char buf[64];
 209 #endif
 210   mbstate_t prev_st;
 211   Idx byte_idx, end_idx, remain_len;
 212   size_t mbclen;
 213
 214   /* Build the buffers from pstr->valid_len to either pstr->len or
 215      pstr->bufs_len.  */
 216   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 217   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 218     {
 219       wchar_t wc;
 220       const char *p;
 221
 222       remain_len = end_idx - byte_idx;
 223       prev_st = pstr->cur_state;
 224       /* Apply the translation if we need.  */
 225       if (BE (pstr->trans != NULL, 0))
 226         {
 227           int i, ch;
 228
 229           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 230             {
 231               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 232               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 233             }
 234           p = (const char *) buf;
 235         }
 236       else
 237         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 238       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 239       if (BE (mbclen == (size_t) -2, 0))
 240         {
 241           /* The buffer doesn't have enough space, finish to build.  */
 242           pstr->cur_state = prev_st;
 243           break;
 244         }
 245       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 246         {
 247           /* We treat these cases as a singlebyte character.  */
 248           mbclen = 1;
 249           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 250           if (BE (pstr->trans != NULL, 0))
 251             wc = pstr->trans[wc];
 252           pstr->cur_state = prev_st;
 253         }
 254
 255       /* Write wide character and padding.  */
 256       pstr->wcs[byte_idx++] = wc;
 257       /* Write paddings.  */
 258       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 259         pstr->wcs[byte_idx++] = WEOF;
 260     }
 261   pstr->valid_len = byte_idx;
 262   pstr->valid_raw_len = byte_idx;
 263 }
 264
 265 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 266    but for REG_ICASE.  */
 267
 268 static reg_errcode_t
 269 internal_function
 270 build_wcs_upper_buffer (re_string_t *pstr)
 271 {
 272   mbstate_t prev_st;
 273   Idx src_idx, byte_idx, end_idx, remain_len;
 274   size_t mbclen;
 275 #ifdef _LIBC
 276   char buf[MB_LEN_MAX];
 277   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 278 #else
 279   char buf[64];
 280 #endif
 281
 282   byte_idx = pstr->valid_len;
 283   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 284
 285   /* The following optimization assumes that ASCII characters can be
 286      mapped to wide characters with a simple cast.  */
 287   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 288     {
 289       while (byte_idx < end_idx)
 290         {
 291           wchar_t wc;
 292
 293           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 294               && mbsinit (&pstr->cur_state))
 295             {
 296               /* In case of a singlebyte character.  */
 297               pstr->mbs[byte_idx]
 298                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 299               /* The next step uses the assumption that wchar_t is encoded
 300                  ASCII-safe: all ASCII values can be converted like this.  */
 301               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 302               ++byte_idx;
 303               continue;
 304             }
 305
 306           remain_len = end_idx - byte_idx;
 307           prev_st = pstr->cur_state;
 308           mbclen = mbrtowc (&wc,
 309                             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 310                              + byte_idx), remain_len, &pstr->cur_state);
 311           if (BE (mbclen < (size_t) -2, 1))
 312             {
 313               wchar_t wcu = wc;
 314               if (iswlower (wc))
 315                 {
 316                   size_t mbcdlen;
 317
 318                   wcu = towupper (wc);
 319                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 320                   if (BE (mbclen == mbcdlen, 1))
 321                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 322                   else
 323                     {
 324                       src_idx = byte_idx;
 325                       goto offsets_needed;
 326                     }
 327                 }
 328               else
 329                 memcpy (pstr->mbs + byte_idx,
 330                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 331               pstr->wcs[byte_idx++] = wcu;
 332               /* Write paddings.  */
 333               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 334                 pstr->wcs[byte_idx++] = WEOF;
 335             }
 336           else if (mbclen == (size_t) -1 || mbclen == 0)
 337             {
 338               /* It is an invalid character or '\0'.  Just use the byte.  */
 339               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 340               pstr->mbs[byte_idx] = ch;
 341               /* And also cast it to wide char.  */
 342               pstr->wcs[byte_idx++] = (wchar_t) ch;
 343               if (BE (mbclen == (size_t) -1, 0))
 344                 pstr->cur_state = prev_st;
 345             }
 346           else
 347             {
 348               /* The buffer doesn't have enough space, finish to build.  */
 349               pstr->cur_state = prev_st;
 350               break;
 351             }
 352         }
 353       pstr->valid_len = byte_idx;
 354       pstr->valid_raw_len = byte_idx;
 355       return REG_NOERROR;
 356     }
 357   else
 358     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 359       {
 360         wchar_t wc;
 361         const char *p;
 362       offsets_needed:
 363         remain_len = end_idx - byte_idx;
 364         prev_st = pstr->cur_state;
 365         if (BE (pstr->trans != NULL, 0))
 366           {
 367             int i, ch;
 368
 369             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 370               {
 371                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 372                 buf[i] = pstr->trans[ch];
 373               }
 374             p = (const char *) buf;
 375           }
 376         else
 377           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 378         mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 379         if (BE (mbclen < (size_t) -2, 1))
 380           {
 381             wchar_t wcu = wc;
 382             if (iswlower (wc))
 383               {
 384                 size_t mbcdlen;
 385
 386                 wcu = towupper (wc);
 387                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 388                 if (BE (mbclen == mbcdlen, 1))
 389                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 390                 else if (mbcdlen != (size_t) -1)
 391                   {
 392                     size_t i;
 393
 394                     if (byte_idx + mbcdlen > pstr->bufs_len)
 395                       {
 396                         pstr->cur_state = prev_st;
 397                         break;
 398                       }
 399
 400                     if (pstr->offsets == NULL)
 401                       {
 402                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 403
 404                         if (pstr->offsets == NULL)
 405                           return REG_ESPACE;
 406                       }
 407                     if (!pstr->offsets_needed)
 408                       {
 409                         for (i = 0; i < (size_t) byte_idx; ++i)
 410                           pstr->offsets[i] = i;
 411                         pstr->offsets_needed = 1;
 412                       }
 413
 414                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 415                     pstr->wcs[byte_idx] = wcu;
 416                     pstr->offsets[byte_idx] = src_idx;
 417                     for (i = 1; i < mbcdlen; ++i)
 418                       {
 419                         pstr->offsets[byte_idx + i]
 420                           = src_idx + (i < mbclen ? i : mbclen - 1);
 421                         pstr->wcs[byte_idx + i] = WEOF;
 422                       }
 423                     pstr->len += mbcdlen - mbclen;
 424                     if (pstr->raw_stop > src_idx)
 425                       pstr->stop += mbcdlen - mbclen;
 426                     end_idx = (pstr->bufs_len > pstr->len)
 427                               ? pstr->len : pstr->bufs_len;
 428                     byte_idx += mbcdlen;
 429                     src_idx += mbclen;
 430                     continue;
 431                   }
 432                 else
 433                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 434               }
 435             else
 436               memcpy (pstr->mbs + byte_idx, p, mbclen);
 437
 438             if (BE (pstr->offsets_needed != 0, 0))
 439               {
 440                 size_t i;
 441                 for (i = 0; i < mbclen; ++i)
 442                   pstr->offsets[byte_idx + i] = src_idx + i;
 443               }
 444             src_idx += mbclen;
 445
 446             pstr->wcs[byte_idx++] = wcu;
 447             /* Write paddings.  */
 448             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 449               pstr->wcs[byte_idx++] = WEOF;
 450           }
 451         else if (mbclen == (size_t) -1 || mbclen == 0)
 452           {
 453             /* It is an invalid character or '\0'.  Just use the byte.  */
 454             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 455
 456             if (BE (pstr->trans != NULL, 0))
 457               ch = pstr->trans [ch];
 458             pstr->mbs[byte_idx] = ch;
 459
 460             if (BE (pstr->offsets_needed != 0, 0))
 461               pstr->offsets[byte_idx] = src_idx;
 462             ++src_idx;
 463
 464             /* And also cast it to wide char.  */
 465             pstr->wcs[byte_idx++] = (wchar_t) ch;
 466             if (BE (mbclen == (size_t) -1, 0))
 467               pstr->cur_state = prev_st;
 468           }
 469         else
 470           {
 471             /* The buffer doesn't have enough space, finish to build.  */
 472             pstr->cur_state = prev_st;
 473             break;
 474           }
 475       }
 476   pstr->valid_len = byte_idx;
 477   pstr->valid_raw_len = src_idx;
 478   return REG_NOERROR;
 479 }
 480
 481 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 482    Return the index.  */
 483
 484 static Idx
 485 internal_function
 486 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 487 {
 488   mbstate_t prev_st;
 489   Idx rawbuf_idx;
 490   size_t mbclen;
 491   wchar_t wc = 0;
 492
 493   /* Skip the characters which are not necessary to check.  */
 494   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 495        rawbuf_idx < new_raw_idx;)
 496     {
 497       Idx remain_len;
 498       remain_len = pstr->len - rawbuf_idx;
 499       prev_st = pstr->cur_state;
 500       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
 501                         remain_len, &pstr->cur_state);
 502       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 503         {
 504           /* We treat these cases as a singlebyte character.  */
 505           mbclen = 1;
 506           pstr->cur_state = prev_st;
 507         }
 508       /* Then proceed the next character.  */
 509       rawbuf_idx += mbclen;
 510     }
 511   *last_wc = (wint_t) wc;
 512   return rawbuf_idx;
 513 }
 514 #endif /* RE_ENABLE_I18N  */
 515
 516 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 517    This function is used in case of REG_ICASE.  */
 518
 519 static void
 520 internal_function
 521 build_upper_buffer (re_string_t *pstr)
 522 {
 523   Idx char_idx, end_idx;
 524   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 525
 526   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 527     {
 528       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 529       if (BE (pstr->trans != NULL, 0))
 530         ch = pstr->trans[ch];
 531       if (islower (ch))
 532         pstr->mbs[char_idx] = toupper (ch);
 533       else
 534         pstr->mbs[char_idx] = ch;
 535     }
 536   pstr->valid_len = char_idx;
 537   pstr->valid_raw_len = char_idx;
 538 }
 539
 540 /* Apply TRANS to the buffer in PSTR.  */
 541
 542 static void
 543 internal_function
 544 re_string_translate_buffer (re_string_t *pstr)
 545 {
 546   Idx buf_idx, end_idx;
 547   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 548
 549   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 550     {
 551       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 552       pstr->mbs[buf_idx] = pstr->trans[ch];
 553     }
 554
 555   pstr->valid_len = buf_idx;
 556   pstr->valid_raw_len = buf_idx;
 557 }
 558
 559 /* This function re-construct the buffers.
 560    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 561    convert to upper case in case of REG_ICASE, apply translation.  */
 562
 563 static reg_errcode_t
 564 internal_function
 565 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 566 {
 567   Idx offset;
 568
 569   if (BE (pstr->raw_mbs_idx <= idx, 0))
 570     offset = idx - pstr->raw_mbs_idx;
 571   else
 572     {
 573       /* Reset buffer.  */
 574 #ifdef RE_ENABLE_I18N
 575       if (pstr->mb_cur_max > 1)
 576         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 577 #endif /* RE_ENABLE_I18N */
 578       pstr->len = pstr->raw_len;
 579       pstr->stop = pstr->raw_stop;
 580       pstr->valid_len = 0;
 581       pstr->raw_mbs_idx = 0;
 582       pstr->valid_raw_len = 0;
 583       pstr->offsets_needed = 0;
 584       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 585                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 586       if (!pstr->mbs_allocated)
 587         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 588       offset = idx;
 589     }
 590
 591   if (BE (offset != 0, 1))
 592     {
 593       /* Are the characters which are already checked remain?  */
 594       if (BE (offset < pstr->valid_raw_len, 1)
 595 #ifdef RE_ENABLE_I18N
 596           /* Handling this would enlarge the code too much.
 597              Accept a slowdown in that case.  */
 598           && pstr->offsets_needed == 0
 599 #endif
 600          )
 601         {
 602           /* Yes, move them to the front of the buffer.  */
 603           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 604 #ifdef RE_ENABLE_I18N
 605           if (pstr->mb_cur_max > 1)
 606             memmove (pstr->wcs, pstr->wcs + offset,
 607                      (pstr->valid_len - offset) * sizeof (wint_t));
 608 #endif /* RE_ENABLE_I18N */
 609           if (BE (pstr->mbs_allocated, 0))
 610             memmove (pstr->mbs, pstr->mbs + offset,
 611                      pstr->valid_len - offset);
 612           pstr->valid_len -= offset;
 613           pstr->valid_raw_len -= offset;
 614 #if DEBUG
 615           assert (pstr->valid_len > 0);
 616 #endif
 617         }
 618       else
 619         {
 620           /* No, skip all characters until IDX.  */
 621 #ifdef RE_ENABLE_I18N
 622           if (BE (pstr->offsets_needed, 0))
 623             {
 624               pstr->len = pstr->raw_len - idx + offset;
 625               pstr->stop = pstr->raw_stop - idx + offset;
 626               pstr->offsets_needed = 0;
 627             }
 628 #endif
 629           pstr->valid_len = 0;
 630           pstr->valid_raw_len = 0;
 631 #ifdef RE_ENABLE_I18N
 632           if (pstr->mb_cur_max > 1)
 633             {
 634               Idx wcs_idx;
 635               wint_t wc = WEOF;
 636
 637               if (pstr->is_utf8)
 638                 {
 639                   const unsigned char *raw, *p, *q, *end;
 640
 641                   /* Special case UTF-8.  Multi-byte chars start with any
 642                      byte other than 0x80 - 0xbf.  */
 643                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 644                   end = raw + (offset - pstr->mb_cur_max);
 645                   p = raw + offset - 1;
 646 #ifdef _LIBC
 647                   /* We know the wchar_t encoding is UCS4, so for the simple
 648                      case, ASCII characters, skip the conversion step.  */
 649                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 650                     {
 651                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 652                       pstr->valid_len = 0;
 653                       wc = (wchar_t) *p;
 654                     }
 655                   else
 656 #endif
 657                     for (; p >= end; --p)
 658                       if ((*p & 0xc0) != 0x80)
 659                         {
 660                           mbstate_t cur_state;
 661                           wchar_t wc2;
 662                           Idx mlen = raw + pstr->len - p;
 663                           unsigned char buf[6];
 664                           size_t mbclen;
 665
 666                           q = p;
 667                           if (BE (pstr->trans != NULL, 0))
 668                             {
 669                               int i = mlen < 6 ? mlen : 6;
 670                               while (--i >= 0)
 671                                 buf[i] = pstr->trans[p[i]];
 672                               q = buf;
 673                             }
 674                           /* XXX Don't use mbrtowc, we know which conversion
 675                              to use (UTF-8 -> UCS4).  */
 676                           memset (&cur_state, 0, sizeof (cur_state));
 677                           mbclen = mbrtowc (&wc2, (const char *) p, mlen,
 678                                             &cur_state);
 679                           if (raw + offset - p <= mbclen
 680                               && mbclen < (size_t) -2)
 681                             {
 682                               memset (&pstr->cur_state, '\0',
 683                                       sizeof (mbstate_t));
 684                               pstr->valid_len = mbclen - (raw + offset - p);
 685                               wc = wc2;
 686                             }
 687                           break;
 688                         }
 689                 }
 690
 691               if (wc == WEOF)
 692                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 693               if (BE (pstr->valid_len, 0))
 694                 {
 695                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 696                     pstr->wcs[wcs_idx] = WEOF;
 697                   if (pstr->mbs_allocated)
 698                     memset (pstr->mbs, -1, pstr->valid_len);
 699                 }
 700               pstr->valid_raw_len = pstr->valid_len;
 701               pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 702                                     && IS_WIDE_WORD_CHAR (wc))
 703                                    ? CONTEXT_WORD
 704                                    : ((IS_WIDE_NEWLINE (wc)
 705                                        && pstr->newline_anchor)
 706                                       ? CONTEXT_NEWLINE : 0));
 707             }
 708           else
 709 #endif /* RE_ENABLE_I18N */
 710             {
 711               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 712               if (pstr->trans)
 713                 c = pstr->trans[c];
 714               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 715                                    ? CONTEXT_WORD
 716                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 717                                       ? CONTEXT_NEWLINE : 0));
 718             }
 719         }
 720       if (!BE (pstr->mbs_allocated, 0))
 721         pstr->mbs += offset;
 722     }
 723   pstr->raw_mbs_idx = idx;
 724   pstr->len -= offset;
 725   pstr->stop -= offset;
 726
 727   /* Then build the buffers.  */
 728 #ifdef RE_ENABLE_I18N
 729   if (pstr->mb_cur_max > 1)
 730     {
 731       if (pstr->icase)
 732         {
 733           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 734           if (BE (ret != REG_NOERROR, 0))
 735             return ret;
 736         }
 737       else
 738         build_wcs_buffer (pstr);
 739     }
 740   else
 741 #endif /* RE_ENABLE_I18N */
 742     if (BE (pstr->mbs_allocated, 0))
 743       {
 744         if (pstr->icase)
 745           build_upper_buffer (pstr);
 746         else if (pstr->trans != NULL)
 747           re_string_translate_buffer (pstr);
 748       }
 749     else
 750       pstr->valid_len = pstr->len;
 751
 752   pstr->cur_idx = 0;
 753   return REG_NOERROR;
 754 }
 755
 756 static unsigned char
 757 internal_function __attribute ((pure))
 758 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 759 {
 760   int ch;
 761   Idx off;
 762
 763   /* Handle the common (easiest) cases first.  */
 764   if (BE (!pstr->mbs_allocated, 1))
 765     return re_string_peek_byte (pstr, idx);
 766
 767 #ifdef RE_ENABLE_I18N
 768   if (pstr->mb_cur_max > 1
 769       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 770     return re_string_peek_byte (pstr, idx);
 771 #endif
 772
 773   off = pstr->cur_idx + idx;
 774 #ifdef RE_ENABLE_I18N
 775   if (pstr->offsets_needed)
 776     off = pstr->offsets[off];
 777 #endif
 778
 779   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 780
 781 #ifdef RE_ENABLE_I18N
 782   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 783      this function returns CAPITAL LETTER I instead of first byte of
 784      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 785      since peek_byte_case doesn't advance cur_idx in any way.  */
 786   if (pstr->offsets_needed && !isascii (ch))
 787     return re_string_peek_byte (pstr, idx);
 788 #endif
 789
 790   return ch;
 791 }
 792
 793 static unsigned char
 794 internal_function __attribute ((pure))
 795 re_string_fetch_byte_case (re_string_t *pstr)
 796 {
 797   if (BE (!pstr->mbs_allocated, 1))
 798     return re_string_fetch_byte (pstr);
 799
 800 #ifdef RE_ENABLE_I18N
 801   if (pstr->offsets_needed)
 802     {
 803       Idx off;
 804       int ch;
 805
 806       /* For tr_TR.UTF-8 [[:islower:]] there is
 807          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 808          in that case the whole multi-byte character and return
 809          the original letter.  On the other side, with
 810          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 811          anything else would complicate things too much.  */
 812
 813       if (!re_string_first_byte (pstr, pstr->cur_idx))
 814         return re_string_fetch_byte (pstr);
 815
 816       off = pstr->offsets[pstr->cur_idx];
 817       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 818
 819       if (! isascii (ch))
 820         return re_string_fetch_byte (pstr);
 821
 822       re_string_skip_bytes (pstr,
 823                             re_string_char_size_at (pstr, pstr->cur_idx));
 824       return ch;
 825     }
 826 #endif
 827
 828   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 829 }
 830
 831 static void
 832 internal_function
 833 re_string_destruct (re_string_t *pstr)
 834 {
 835 #ifdef RE_ENABLE_I18N
 836   re_free (pstr->wcs);
 837   re_free (pstr->offsets);
 838 #endif /* RE_ENABLE_I18N  */
 839   if (pstr->mbs_allocated)
 840     re_free (pstr->mbs);
 841 }
 842
 843 /* Return the context at IDX in INPUT.  */
 844
 845 static unsigned int
 846 internal_function
 847 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 848 {
 849   int c;
 850   if (BE (! REG_VALID_INDEX (idx), 0))
 851     /* In this case, we use the value stored in input->tip_context,
 852        since we can't know the character in input->mbs[-1] here.  */
 853     return input->tip_context;
 854   if (BE (idx == input->len, 0))
 855     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 856             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 857 #ifdef RE_ENABLE_I18N
 858   if (input->mb_cur_max > 1)
 859     {
 860       wint_t wc;
 861       Idx wc_idx = idx;
 862       while(input->wcs[wc_idx] == WEOF)
 863         {
 864 #ifdef DEBUG
 865           /* It must not happen.  */
 866           assert (REG_VALID_INDEX (wc_idx));
 867 #endif
 868           --wc_idx;
 869           if (! REG_VALID_INDEX (wc_idx))
 870             return input->tip_context;
 871         }
 872       wc = input->wcs[wc_idx];
 873       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 874         return CONTEXT_WORD;
 875       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 876               ? CONTEXT_NEWLINE : 0);
 877     }
 878   else
 879 #endif
 880     {
 881       c = re_string_byte_at (input, idx);
 882       if (bitset_contain (input->word_char, c))
 883         return CONTEXT_WORD;
 884       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 885     }
 886 }
 887 \f
 888 /* Functions for set operation.  */
 889
 890 static reg_errcode_t
 891 internal_function
 892 re_node_set_alloc (re_node_set *set, Idx size)
 893 {
 894   set->alloc = size;
 895   set->nelem = 0;
 896   set->elems = re_malloc (Idx, size);
 897   if (BE (set->elems == NULL, 0))
 898     return REG_ESPACE;
 899   return REG_NOERROR;
 900 }
 901
 902 static reg_errcode_t
 903 internal_function
 904 re_node_set_init_1 (re_node_set *set, Idx elem)
 905 {
 906   set->alloc = 1;
 907   set->nelem = 1;
 908   set->elems = re_malloc (Idx, 1);
 909   if (BE (set->elems == NULL, 0))
 910     {
 911       set->alloc = set->nelem = 0;
 912       return REG_ESPACE;
 913     }
 914   set->elems[0] = elem;
 915   return REG_NOERROR;
 916 }
 917
 918 static reg_errcode_t
 919 internal_function
 920 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
 921 {
 922   set->alloc = 2;
 923   set->elems = re_malloc (Idx, 2);
 924   if (BE (set->elems == NULL, 0))
 925     return REG_ESPACE;
 926   if (elem1 == elem2)
 927     {
 928       set->nelem = 1;
 929       set->elems[0] = elem1;
 930     }
 931   else
 932     {
 933       set->nelem = 2;
 934       if (elem1 < elem2)
 935         {
 936           set->elems[0] = elem1;
 937           set->elems[1] = elem2;
 938         }
 939       else
 940         {
 941           set->elems[0] = elem2;
 942           set->elems[1] = elem1;
 943         }
 944     }
 945   return REG_NOERROR;
 946 }
 947
 948 static reg_errcode_t
 949 internal_function
 950 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
 951 {
 952   dest->nelem = src->nelem;
 953   if (src->nelem > 0)
 954     {
 955       dest->alloc = dest->nelem;
 956       dest->elems = re_malloc (Idx, dest->alloc);
 957       if (BE (dest->elems == NULL, 0))
 958         {
 959           dest->alloc = dest->nelem = 0;
 960           return REG_ESPACE;
 961         }
 962       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
 963     }
 964   else
 965     re_node_set_init_empty (dest);
 966   return REG_NOERROR;
 967 }
 968
 969 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
 970    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
 971    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
 972
 973 static reg_errcode_t
 974 internal_function
 975 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
 976                            const re_node_set *src2)
 977 {
 978   Idx i1, i2, is, id, delta, sbase;
 979   if (src1->nelem == 0 || src2->nelem == 0)
 980     return REG_NOERROR;
 981
 982   /* We need dest->nelem + 2 * elems_in_intersection; this is a
 983      conservative estimate.  */
 984   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
 985     {
 986       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
 987       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
 988       if (BE (new_elems == NULL, 0))
 989         return REG_ESPACE;
 990       dest->elems = new_elems;
 991       dest->alloc = new_alloc;
 992     }
 993
 994   /* Find the items in the intersection of SRC1 and SRC2, and copy
 995      into the top of DEST those that are not already in DEST itself.  */
 996   sbase = dest->nelem + src1->nelem + src2->nelem;
 997   i1 = src1->nelem - 1;
 998   i2 = src2->nelem - 1;
 999   id = dest->nelem - 1;
1000   for (;;)
1001     {
1002       if (src1->elems[i1] == src2->elems[i2])
1003         {
1004           /* Try to find the item in DEST.  Maybe we could binary search?  */
1005           while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1006             --id;
1007
1008           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1009             dest->elems[--sbase] = src1->elems[i1];
1010
1011           if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1012             break;
1013         }
1014
1015       /* Lower the highest of the two items.  */
1016       else if (src1->elems[i1] < src2->elems[i2])
1017         {
1018           if (! REG_VALID_INDEX (--i2))
1019             break;
1020         }
1021       else
1022         {
1023           if (! REG_VALID_INDEX (--i1))
1024             break;
1025         }
1026     }
1027
1028   id = dest->nelem - 1;
1029   is = dest->nelem + src1->nelem + src2->nelem - 1;
1030   delta = is - sbase + 1;
1031
1032   /* Now copy.  When DELTA becomes zero, the remaining
1033      DEST elements are already in place; this is more or
1034      less the same loop that is in re_node_set_merge.  */
1035   dest->nelem += delta;
1036   if (delta > 0 && REG_VALID_INDEX (id))
1037     for (;;)
1038       {
1039         if (dest->elems[is] > dest->elems[id])
1040           {
1041             /* Copy from the top.  */
1042             dest->elems[id + delta--] = dest->elems[is--];
1043             if (delta == 0)
1044               break;
1045           }
1046         else
1047           {
1048             /* Slide from the bottom.  */
1049             dest->elems[id + delta] = dest->elems[id];
1050             if (! REG_VALID_INDEX (--id))
1051               break;
1052           }
1053       }
1054
1055   /* Copy remaining SRC elements.  */
1056   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1057
1058   return REG_NOERROR;
1059 }
1060
1061 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1062    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1063
1064 static reg_errcode_t
1065 internal_function
1066 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1067                         const re_node_set *src2)
1068 {
1069   Idx i1, i2, id;
1070   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1071     {
1072       dest->alloc = src1->nelem + src2->nelem;
1073       dest->elems = re_malloc (Idx, dest->alloc);
1074       if (BE (dest->elems == NULL, 0))
1075         return REG_ESPACE;
1076     }
1077   else
1078     {
1079       if (src1 != NULL && src1->nelem > 0)
1080         return re_node_set_init_copy (dest, src1);
1081       else if (src2 != NULL && src2->nelem > 0)
1082         return re_node_set_init_copy (dest, src2);
1083       else
1084         re_node_set_init_empty (dest);
1085       return REG_NOERROR;
1086     }
1087   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1088     {
1089       if (src1->elems[i1] > src2->elems[i2])
1090         {
1091           dest->elems[id++] = src2->elems[i2++];
1092           continue;
1093         }
1094       if (src1->elems[i1] == src2->elems[i2])
1095         ++i2;
1096       dest->elems[id++] = src1->elems[i1++];
1097     }
1098   if (i1 < src1->nelem)
1099     {
1100       memcpy (dest->elems + id, src1->elems + i1,
1101              (src1->nelem - i1) * sizeof (Idx));
1102       id += src1->nelem - i1;
1103     }
1104   else if (i2 < src2->nelem)
1105     {
1106       memcpy (dest->elems + id, src2->elems + i2,
1107              (src2->nelem - i2) * sizeof (Idx));
1108       id += src2->nelem - i2;
1109     }
1110   dest->nelem = id;
1111   return REG_NOERROR;
1112 }
1113
1114 /* Calculate the union set of the sets DEST and SRC. And store it to
1115    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1116
1117 static reg_errcode_t
1118 internal_function
1119 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1120 {
1121   Idx is, id, sbase, delta;
1122   if (src == NULL || src->nelem == 0)
1123     return REG_NOERROR;
1124   if (dest->alloc < 2 * src->nelem + dest->nelem)
1125     {
1126       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1127       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1128       if (BE (new_buffer == NULL, 0))
1129         return REG_ESPACE;
1130       dest->elems = new_buffer;
1131       dest->alloc = new_alloc;
1132     }
1133
1134   if (BE (dest->nelem == 0, 0))
1135     {
1136       dest->nelem = src->nelem;
1137       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1138       return REG_NOERROR;
1139     }
1140
1141   /* Copy into the top of DEST the items of SRC that are not
1142      found in DEST.  Maybe we could binary search in DEST?  */
1143   for (sbase = dest->nelem + 2 * src->nelem,
1144        is = src->nelem - 1, id = dest->nelem - 1;
1145        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1146     {
1147       if (dest->elems[id] == src->elems[is])
1148         is--, id--;
1149       else if (dest->elems[id] < src->elems[is])
1150         dest->elems[--sbase] = src->elems[is--];
1151       else /* if (dest->elems[id] > src->elems[is]) */
1152         --id;
1153     }
1154
1155   if (REG_VALID_INDEX (is))
1156     {
1157       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1158       sbase -= is + 1;
1159       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1160     }
1161
1162   id = dest->nelem - 1;
1163   is = dest->nelem + 2 * src->nelem - 1;
1164   delta = is - sbase + 1;
1165   if (delta == 0)
1166     return REG_NOERROR;
1167
1168   /* Now copy.  When DELTA becomes zero, the remaining
1169      DEST elements are already in place.  */
1170   dest->nelem += delta;
1171   for (;;)
1172     {
1173       if (dest->elems[is] > dest->elems[id])
1174         {
1175           /* Copy from the top.  */
1176           dest->elems[id + delta--] = dest->elems[is--];
1177           if (delta == 0)
1178             break;
1179         }
1180       else
1181         {
1182           /* Slide from the bottom.  */
1183           dest->elems[id + delta] = dest->elems[id];
1184           if (! REG_VALID_INDEX (--id))
1185             {
1186               /* Copy remaining SRC elements.  */
1187               memcpy (dest->elems, dest->elems + sbase,
1188                       delta * sizeof (Idx));
1189               break;
1190             }
1191         }
1192     }
1193
1194   return REG_NOERROR;
1195 }
1196
1197 /* Insert the new element ELEM to the re_node_set* SET.
1198    SET should not already have ELEM.
1199    Return true if successful.  */
1200
1201 static bool
1202 internal_function
1203 re_node_set_insert (re_node_set *set, Idx elem)
1204 {
1205   Idx idx;
1206   /* In case the set is empty.  */
1207   if (set->alloc == 0)
1208     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1209
1210   if (BE (set->nelem, 0) == 0)
1211     {
1212       /* We already guaranteed above that set->alloc != 0.  */
1213       set->elems[0] = elem;
1214       ++set->nelem;
1215       return true;
1216     }
1217
1218   /* Realloc if we need.  */
1219   if (set->alloc == set->nelem)
1220     {
1221       Idx *new_elems;
1222       set->alloc = set->alloc * 2;
1223       new_elems = re_realloc (set->elems, Idx, set->alloc);
1224       if (BE (new_elems == NULL, 0))
1225         return false;
1226       set->elems = new_elems;
1227     }
1228
1229   /* Move the elements which follows the new element.  Test the
1230      first element separately to skip a check in the inner loop.  */
1231   if (elem < set->elems[0])
1232     {
1233       idx = 0;
1234       for (idx = set->nelem; idx > 0; idx--)
1235         set->elems[idx] = set->elems[idx - 1];
1236     }
1237   else
1238     {
1239       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1240         set->elems[idx] = set->elems[idx - 1];
1241     }
1242
1243   /* Insert the new element.  */
1244   set->elems[idx] = elem;
1245   ++set->nelem;
1246   return true;
1247 }
1248
1249 /* Insert the new element ELEM to the re_node_set* SET.
1250    SET should not already have any element greater than or equal to ELEM.
1251    Return true if successful.  */
1252
1253 static bool
1254 internal_function
1255 re_node_set_insert_last (re_node_set *set, Idx elem)
1256 {
1257   /* Realloc if we need.  */
1258   if (set->alloc == set->nelem)
1259     {
1260       Idx *new_elems;
1261       set->alloc = (set->alloc + 1) * 2;
1262       new_elems = re_realloc (set->elems, Idx, set->alloc);
1263       if (BE (new_elems == NULL, 0))
1264         return false;
1265       set->elems = new_elems;
1266     }
1267
1268   /* Insert the new element.  */
1269   set->elems[set->nelem++] = elem;
1270   return true;
1271 }
1272
1273 /* Compare two node sets SET1 and SET2.
1274    Return true if SET1 and SET2 are equivalent.  */
1275
1276 static bool
1277 internal_function __attribute ((pure))
1278 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1279 {
1280   Idx i;
1281   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1282     return false;
1283   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1284     if (set1->elems[i] != set2->elems[i])
1285       return false;
1286   return true;
1287 }
1288
1289 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1290
1291 static Idx
1292 internal_function __attribute ((pure))
1293 re_node_set_contains (const re_node_set *set, Idx elem)
1294 {
1295   __re_size_t idx, right, mid;
1296   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1297     return 0;
1298
1299   /* Binary search the element.  */
1300   idx = 0;
1301   right = set->nelem - 1;
1302   while (idx < right)
1303     {
1304       mid = (idx + right) / 2;
1305       if (set->elems[mid] < elem)
1306         idx = mid + 1;
1307       else
1308         right = mid;
1309     }
1310   return set->elems[idx] == elem ? idx + 1 : 0;
1311 }
1312
1313 static void
1314 internal_function
1315 re_node_set_remove_at (re_node_set *set, Idx idx)
1316 {
1317   if (idx < 0 || idx >= set->nelem)
1318     return;
1319   --set->nelem;
1320   for (; idx < set->nelem; idx++)
1321     set->elems[idx] = set->elems[idx + 1];
1322 }
1323 \f
1324
1325 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1326    Or return REG_MISSING if an error occurred.  */
1327
1328 static Idx
1329 internal_function
1330 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1331 {
1332   int type = token.type;
1333   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1334     {
1335       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1336       Idx *new_nexts, *new_indices;
1337       re_node_set *new_edests, *new_eclosures;
1338       re_token_t *new_nodes;
1339       size_t max_object_size =
1340         MAX (sizeof (re_token_t),
1341              MAX (sizeof (re_node_set),
1342                   sizeof (Idx)));
1343
1344       /* Avoid overflows.  */
1345       if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1346         return REG_MISSING;
1347
1348       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1349       if (BE (new_nodes == NULL, 0))
1350         return REG_MISSING;
1351       dfa->nodes = new_nodes;
1352       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1353       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1354       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1355       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1356       if (BE (new_nexts == NULL || new_indices == NULL
1357               || new_edests == NULL || new_eclosures == NULL, 0))
1358         return REG_MISSING;
1359       dfa->nexts = new_nexts;
1360       dfa->org_indices = new_indices;
1361       dfa->edests = new_edests;
1362       dfa->eclosures = new_eclosures;
1363       dfa->nodes_alloc = new_nodes_alloc;
1364     }
1365   dfa->nodes[dfa->nodes_len] = token;
1366   dfa->nodes[dfa->nodes_len].constraint = 0;
1367 #ifdef RE_ENABLE_I18N
1368   dfa->nodes[dfa->nodes_len].accept_mb =
1369     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1370 #endif
1371   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1372   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1373   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1374   return dfa->nodes_len++;
1375 }
1376
1377 static inline re_hashval_t
1378 internal_function
1379 calc_state_hash (const re_node_set *nodes, unsigned int context)
1380 {
1381   re_hashval_t hash = nodes->nelem + context;
1382   Idx i;
1383   for (i = 0 ; i < nodes->nelem ; i++)
1384     hash += nodes->elems[i];
1385   return hash;
1386 }
1387
1388 /* Search for the state whose node_set is equivalent to NODES.
1389    Return the pointer to the state, if we found it in the DFA.
1390    Otherwise create the new one and return it.  In case of an error
1391    return NULL and set the error code in ERR.
1392    Note: - We assume NULL as the invalid state, then it is possible that
1393            return value is NULL and ERR is REG_NOERROR.
1394          - We never return non-NULL value in case of any errors, it is for
1395            optimization.  */
1396
1397 static re_dfastate_t *
1398 internal_function
1399 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1400                   const re_node_set *nodes)
1401 {
1402   re_hashval_t hash;
1403   re_dfastate_t *new_state;
1404   struct re_state_table_entry *spot;
1405   Idx i;
1406 #ifdef lint
1407   /* Suppress bogus uninitialized-variable warnings.  */
1408   *err = REG_NOERROR;
1409 #endif
1410   if (BE (nodes->nelem == 0, 0))
1411     {
1412       *err = REG_NOERROR;
1413       return NULL;
1414     }
1415   hash = calc_state_hash (nodes, 0);
1416   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1417
1418   for (i = 0 ; i < spot->num ; i++)
1419     {
1420       re_dfastate_t *state = spot->array[i];
1421       if (hash != state->hash)
1422         continue;
1423       if (re_node_set_compare (&state->nodes, nodes))
1424         return state;
1425     }
1426
1427   /* There are no appropriate state in the dfa, create the new one.  */
1428   new_state = create_ci_newstate (dfa, nodes, hash);
1429   if (BE (new_state == NULL, 0))
1430     *err = REG_ESPACE;
1431
1432   return new_state;
1433 }
1434
1435 /* Search for the state whose node_set is equivalent to NODES and
1436    whose context is equivalent to CONTEXT.
1437    Return the pointer to the state, if we found it in the DFA.
1438    Otherwise create the new one and return it.  In case of an error
1439    return NULL and set the error code in ERR.
1440    Note: - We assume NULL as the invalid state, then it is possible that
1441            return value is NULL and ERR is REG_NOERROR.
1442          - We never return non-NULL value in case of any errors, it is for
1443            optimization.  */
1444
1445 static re_dfastate_t *
1446 internal_function
1447 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1448                           const re_node_set *nodes, unsigned int context)
1449 {
1450   re_hashval_t hash;
1451   re_dfastate_t *new_state;
1452   struct re_state_table_entry *spot;
1453   Idx i;
1454 #ifdef lint
1455   /* Suppress bogus uninitialized-variable warnings.  */
1456   *err = REG_NOERROR;
1457 #endif
1458   if (nodes->nelem == 0)
1459     {
1460       *err = REG_NOERROR;
1461       return NULL;
1462     }
1463   hash = calc_state_hash (nodes, context);
1464   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1465
1466   for (i = 0 ; i < spot->num ; i++)
1467     {
1468       re_dfastate_t *state = spot->array[i];
1469       if (state->hash == hash
1470           && state->context == context
1471           && re_node_set_compare (state->entrance_nodes, nodes))
1472         return state;
1473     }
1474   /* There are no appropriate state in `dfa', create the new one.  */
1475   new_state = create_cd_newstate (dfa, nodes, context, hash);
1476   if (BE (new_state == NULL, 0))
1477     *err = REG_ESPACE;
1478
1479   return new_state;
1480 }
1481
1482 /* Finish initialization of the new state NEWSTATE, and using its hash value
1483    HASH put in the appropriate bucket of DFA's state table.  Return value
1484    indicates the error code if failed.  */
1485
1486 static reg_errcode_t
1487 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1488                 re_hashval_t hash)
1489 {
1490   struct re_state_table_entry *spot;
1491   reg_errcode_t err;
1492   Idx i;
1493
1494   newstate->hash = hash;
1495   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1496   if (BE (err != REG_NOERROR, 0))
1497     return REG_ESPACE;
1498   for (i = 0; i < newstate->nodes.nelem; i++)
1499     {
1500       Idx elem = newstate->nodes.elems[i];
1501       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1502         if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1503           return REG_ESPACE;
1504     }
1505
1506   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1507   if (BE (spot->alloc <= spot->num, 0))
1508     {
1509       Idx new_alloc = 2 * spot->num + 2;
1510       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1511                                               new_alloc);
1512       if (BE (new_array == NULL, 0))
1513         return REG_ESPACE;
1514       spot->array = new_array;
1515       spot->alloc = new_alloc;
1516     }
1517   spot->array[spot->num++] = newstate;
1518   return REG_NOERROR;
1519 }
1520
1521 static void
1522 free_state (re_dfastate_t *state)
1523 {
1524   re_node_set_free (&state->non_eps_nodes);
1525   re_node_set_free (&state->inveclosure);
1526   if (state->entrance_nodes != &state->nodes)
1527     {
1528       re_node_set_free (state->entrance_nodes);
1529       re_free (state->entrance_nodes);
1530     }
1531   re_node_set_free (&state->nodes);
1532   re_free (state->word_trtable);
1533   re_free (state->trtable);
1534   re_free (state);
1535 }
1536
1537 /* Create the new state which is independ of contexts.
1538    Return the new state if succeeded, otherwise return NULL.  */
1539
1540 static re_dfastate_t *
1541 internal_function
1542 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1543                     re_hashval_t hash)
1544 {
1545   Idx i;
1546   reg_errcode_t err;
1547   re_dfastate_t *newstate;
1548
1549   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1550   if (BE (newstate == NULL, 0))
1551     return NULL;
1552   err = re_node_set_init_copy (&newstate->nodes, nodes);
1553   if (BE (err != REG_NOERROR, 0))
1554     {
1555       re_free (newstate);
1556       return NULL;
1557     }
1558
1559   newstate->entrance_nodes = &newstate->nodes;
1560   for (i = 0 ; i < nodes->nelem ; i++)
1561     {
1562       re_token_t *node = dfa->nodes + nodes->elems[i];
1563       re_token_type_t type = node->type;
1564       if (type == CHARACTER && !node->constraint)
1565         continue;
1566 #ifdef RE_ENABLE_I18N
1567       newstate->accept_mb |= node->accept_mb;
1568 #endif /* RE_ENABLE_I18N */
1569
1570       /* If the state has the halt node, the state is a halt state.  */
1571       if (type == END_OF_RE)
1572         newstate->halt = 1;
1573       else if (type == OP_BACK_REF)
1574         newstate->has_backref = 1;
1575       else if (type == ANCHOR || node->constraint)
1576         newstate->has_constraint = 1;
1577     }
1578   err = register_state (dfa, newstate, hash);
1579   if (BE (err != REG_NOERROR, 0))
1580     {
1581       free_state (newstate);
1582       newstate = NULL;
1583     }
1584   return newstate;
1585 }
1586
1587 /* Create the new state which is depend on the context CONTEXT.
1588    Return the new state if succeeded, otherwise return NULL.  */
1589
1590 static re_dfastate_t *
1591 internal_function
1592 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1593                     unsigned int context, re_hashval_t hash)
1594 {
1595   Idx i, nctx_nodes = 0;
1596   reg_errcode_t err;
1597   re_dfastate_t *newstate;
1598
1599   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1600   if (BE (newstate == NULL, 0))
1601     return NULL;
1602   err = re_node_set_init_copy (&newstate->nodes, nodes);
1603   if (BE (err != REG_NOERROR, 0))
1604     {
1605       re_free (newstate);
1606       return NULL;
1607     }
1608
1609   newstate->context = context;
1610   newstate->entrance_nodes = &newstate->nodes;
1611
1612   for (i = 0 ; i < nodes->nelem ; i++)
1613     {
1614       unsigned int constraint = 0;
1615       re_token_t *node = dfa->nodes + nodes->elems[i];
1616       re_token_type_t type = node->type;
1617       if (node->constraint)
1618         constraint = node->constraint;
1619
1620       if (type == CHARACTER && !constraint)
1621         continue;
1622 #ifdef RE_ENABLE_I18N
1623       newstate->accept_mb |= node->accept_mb;
1624 #endif /* RE_ENABLE_I18N */
1625
1626       /* If the state has the halt node, the state is a halt state.  */
1627       if (type == END_OF_RE)
1628         newstate->halt = 1;
1629       else if (type == OP_BACK_REF)
1630         newstate->has_backref = 1;
1631       else if (type == ANCHOR)
1632         constraint = node->opr.ctx_type;
1633
1634       if (constraint)
1635         {
1636           if (newstate->entrance_nodes == &newstate->nodes)
1637             {
1638               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1639               if (BE (newstate->entrance_nodes == NULL, 0))
1640                 {
1641                   free_state (newstate);
1642                   return NULL;
1643                 }
1644               re_node_set_init_copy (newstate->entrance_nodes, nodes);
1645               nctx_nodes = 0;
1646               newstate->has_constraint = 1;
1647             }
1648
1649           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1650             {
1651               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1652               ++nctx_nodes;
1653             }
1654         }
1655     }
1656   err = register_state (dfa, newstate, hash);
1657   if (BE (err != REG_NOERROR, 0))
1658     {
1659       free_state (newstate);
1660       newstate = NULL;
1661     }
1662   return  newstate;
1663 }