lib/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static void re_string_construct_common (const char *str, Idx len,
  21                                         re_string_t *pstr,
  22                                         REG_TRANSLATE_TYPE trans, int icase,
  23                                         const re_dfa_t *dfa) internal_function;
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           re_hashval_t hash) internal_function;
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           re_hashval_t hash) internal_function;
  31 \f
  32 /* Functions for string operation.  */
  33
  34 /* This function allocate the buffers.  It is necessary to call
  35    re_string_reconstruct before using the object.  */
  36
  37 static reg_errcode_t
  38 internal_function
  39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  40                     REG_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  41 {
  42   reg_errcode_t ret;
  43   Idx init_buf_len;
  44
  45   /* Ensure at least one character fits into the buffers.  */
  46   if (init_len < dfa->mb_cur_max)
  47     init_len = dfa->mb_cur_max;
  48   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  49   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  50
  51   ret = re_string_realloc_buffers (pstr, init_buf_len);
  52   if (BE (ret != REG_NOERROR, 0))
  53     return ret;
  54
  55   pstr->word_char = dfa->word_char;
  56   pstr->word_ops_used = dfa->word_ops_used;
  57   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  58   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  59   pstr->valid_raw_len = pstr->valid_len;
  60   return REG_NOERROR;
  61 }
  62
  63 /* This function allocate the buffers, and initialize them.  */
  64
  65 static reg_errcode_t
  66 internal_function
  67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  68                      REG_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  69 {
  70   reg_errcode_t ret;
  71   memset (pstr, '\0', sizeof (re_string_t));
  72   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  73
  74   if (len > 0)
  75     {
  76       ret = re_string_realloc_buffers (pstr, len + 1);
  77       if (BE (ret != REG_NOERROR, 0))
  78         return ret;
  79     }
  80   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  81
  82   if (icase)
  83     {
  84 #ifdef RE_ENABLE_I18N
  85       if (dfa->mb_cur_max > 1)
  86         {
  87           while (1)
  88             {
  89               ret = build_wcs_upper_buffer (pstr);
  90               if (BE (ret != REG_NOERROR, 0))
  91                 return ret;
  92               if (pstr->valid_raw_len >= len)
  93                 break;
  94               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  95                 break;
  96               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  97               if (BE (ret != REG_NOERROR, 0))
  98                 return ret;
  99             }
 100         }
 101       else
 102 #endif /* RE_ENABLE_I18N  */
 103         build_upper_buffer (pstr);
 104     }
 105   else
 106     {
 107 #ifdef RE_ENABLE_I18N
 108       if (dfa->mb_cur_max > 1)
 109         build_wcs_buffer (pstr);
 110       else
 111 #endif /* RE_ENABLE_I18N  */
 112         {
 113           if (trans != NULL)
 114             re_string_translate_buffer (pstr);
 115           else
 116             {
 117               pstr->valid_len = pstr->bufs_len;
 118               pstr->valid_raw_len = pstr->bufs_len;
 119             }
 120         }
 121     }
 122
 123   return REG_NOERROR;
 124 }
 125
 126 /* Helper functions for re_string_allocate, and re_string_construct.  */
 127
 128 static reg_errcode_t
 129 internal_function
 130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 131 {
 132 #ifdef RE_ENABLE_I18N
 133   if (pstr->mb_cur_max > 1)
 134     {
 135       wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 136       if (BE (new_wcs == NULL, 0))
 137         return REG_ESPACE;
 138       pstr->wcs = new_wcs;
 139       if (pstr->offsets != NULL)
 140         {
 141           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 142           if (BE (new_offsets == NULL, 0))
 143             return REG_ESPACE;
 144           pstr->offsets = new_offsets;
 145         }
 146     }
 147 #endif /* RE_ENABLE_I18N  */
 148   if (pstr->mbs_allocated)
 149     {
 150       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 151                                            new_buf_len);
 152       if (BE (new_mbs == NULL, 0))
 153         return REG_ESPACE;
 154       pstr->mbs = new_mbs;
 155     }
 156   pstr->bufs_len = new_buf_len;
 157   return REG_NOERROR;
 158 }
 159
 160
 161 static void
 162 internal_function
 163 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 164                             REG_TRANSLATE_TYPE trans, int icase,
 165                             const re_dfa_t *dfa)
 166 {
 167   pstr->raw_mbs = (const unsigned char *) str;
 168   pstr->len = len;
 169   pstr->raw_len = len;
 170   pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans;
 171   pstr->icase = icase ? 1 : 0;
 172   pstr->mbs_allocated = (trans != NULL || icase);
 173   pstr->mb_cur_max = dfa->mb_cur_max;
 174   pstr->is_utf8 = dfa->is_utf8;
 175   pstr->map_notascii = dfa->map_notascii;
 176   pstr->stop = pstr->len;
 177   pstr->raw_stop = pstr->stop;
 178 }
 179
 180 #ifdef RE_ENABLE_I18N
 181
 182 /* Build wide character buffer PSTR->WCS.
 183    If the byte sequence of the string are:
 184      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 185    Then wide character buffer will be:
 186      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 187    We use WEOF for padding, they indicate that the position isn't
 188    a first byte of a multibyte character.
 189
 190    Note that this function assumes PSTR->VALID_LEN elements are already
 191    built and starts from PSTR->VALID_LEN.  */
 192
 193 static void
 194 internal_function
 195 build_wcs_buffer (re_string_t *pstr)
 196 {
 197 #ifdef _LIBC
 198   unsigned char buf[MB_LEN_MAX];
 199   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 200 #else
 201   unsigned char buf[64];
 202 #endif
 203   mbstate_t prev_st;
 204   Idx byte_idx, end_idx, remain_len;
 205   size_t mbclen;
 206
 207   /* Build the buffers from pstr->valid_len to either pstr->len or
 208      pstr->bufs_len.  */
 209   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 210   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 211     {
 212       wchar_t wc;
 213       const char *p;
 214
 215       remain_len = end_idx - byte_idx;
 216       prev_st = pstr->cur_state;
 217       /* Apply the translation if we need.  */
 218       if (BE (pstr->trans != NULL, 0))
 219         {
 220           int i, ch;
 221
 222           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 223             {
 224               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 225               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 226             }
 227           p = (const char *) buf;
 228         }
 229       else
 230         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 231       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 232       if (BE (mbclen == (size_t) -2, 0))
 233         {
 234           /* The buffer doesn't have enough space, finish to build.  */
 235           pstr->cur_state = prev_st;
 236           break;
 237         }
 238       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 239         {
 240           /* We treat these cases as a singlebyte character.  */
 241           mbclen = 1;
 242           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 243           if (BE (pstr->trans != NULL, 0))
 244             wc = pstr->trans[wc];
 245           pstr->cur_state = prev_st;
 246         }
 247
 248       /* Write wide character and padding.  */
 249       pstr->wcs[byte_idx++] = wc;
 250       /* Write paddings.  */
 251       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 252         pstr->wcs[byte_idx++] = WEOF;
 253     }
 254   pstr->valid_len = byte_idx;
 255   pstr->valid_raw_len = byte_idx;
 256 }
 257
 258 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 259    but for REG_ICASE.  */
 260
 261 static reg_errcode_t
 262 internal_function
 263 build_wcs_upper_buffer (re_string_t *pstr)
 264 {
 265   mbstate_t prev_st;
 266   Idx src_idx, byte_idx, end_idx, remain_len;
 267   size_t mbclen;
 268 #ifdef _LIBC
 269   char buf[MB_LEN_MAX];
 270   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 271 #else
 272   char buf[64];
 273 #endif
 274
 275   byte_idx = pstr->valid_len;
 276   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 277
 278   /* The following optimization assumes that ASCII characters can be
 279      mapped to wide characters with a simple cast.  */
 280   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 281     {
 282       while (byte_idx < end_idx)
 283         {
 284           wchar_t wc;
 285
 286           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 287               && mbsinit (&pstr->cur_state))
 288             {
 289               /* In case of a singlebyte character.  */
 290               pstr->mbs[byte_idx]
 291                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 292               /* The next step uses the assumption that wchar_t is encoded
 293                  ASCII-safe: all ASCII values can be converted like this.  */
 294               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 295               ++byte_idx;
 296               continue;
 297             }
 298
 299           remain_len = end_idx - byte_idx;
 300           prev_st = pstr->cur_state;
 301           mbclen = mbrtowc (&wc,
 302                             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 303                              + byte_idx), remain_len, &pstr->cur_state);
 304           if (BE (mbclen + 2 > 2, 1))
 305             {
 306               wchar_t wcu = wc;
 307               if (iswlower (wc))
 308                 {
 309                   size_t mbcdlen;
 310
 311                   wcu = towupper (wc);
 312                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 313                   if (BE (mbclen == mbcdlen, 1))
 314                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 315                   else
 316                     {
 317                       src_idx = byte_idx;
 318                       goto offsets_needed;
 319                     }
 320                 }
 321               else
 322                 memcpy (pstr->mbs + byte_idx,
 323                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 324               pstr->wcs[byte_idx++] = wcu;
 325               /* Write paddings.  */
 326               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 327                 pstr->wcs[byte_idx++] = WEOF;
 328             }
 329           else if (mbclen == (size_t) -1 || mbclen == 0)
 330             {
 331               /* It is an invalid character or '\0'.  Just use the byte.  */
 332               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 333               pstr->mbs[byte_idx] = ch;
 334               /* And also cast it to wide char.  */
 335               pstr->wcs[byte_idx++] = (wchar_t) ch;
 336               if (BE (mbclen == (size_t) -1, 0))
 337                 pstr->cur_state = prev_st;
 338             }
 339           else
 340             {
 341               /* The buffer doesn't have enough space, finish to build.  */
 342               pstr->cur_state = prev_st;
 343               break;
 344             }
 345         }
 346       pstr->valid_len = byte_idx;
 347       pstr->valid_raw_len = byte_idx;
 348       return REG_NOERROR;
 349     }
 350   else
 351     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 352       {
 353         wchar_t wc;
 354         const char *p;
 355       offsets_needed:
 356         remain_len = end_idx - byte_idx;
 357         prev_st = pstr->cur_state;
 358         if (BE (pstr->trans != NULL, 0))
 359           {
 360             int i, ch;
 361
 362             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 363               {
 364                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 365                 buf[i] = pstr->trans[ch];
 366               }
 367             p = (const char *) buf;
 368           }
 369         else
 370           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 371         mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 372         if (BE (mbclen + 2 > 2, 1))
 373           {
 374             wchar_t wcu = wc;
 375             if (iswlower (wc))
 376               {
 377                 size_t mbcdlen;
 378
 379                 wcu = towupper (wc);
 380                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 381                 if (BE (mbclen == mbcdlen, 1))
 382                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 383                 else if (mbcdlen != (size_t) -1)
 384                   {
 385                     size_t i;
 386
 387                     if (byte_idx + mbcdlen > pstr->bufs_len)
 388                       {
 389                         pstr->cur_state = prev_st;
 390                         break;
 391                       }
 392
 393                     if (pstr->offsets == NULL)
 394                       {
 395                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 396
 397                         if (pstr->offsets == NULL)
 398                           return REG_ESPACE;
 399                       }
 400                     if (!pstr->offsets_needed)
 401                       {
 402                         for (i = 0; i < (size_t) byte_idx; ++i)
 403                           pstr->offsets[i] = i;
 404                         pstr->offsets_needed = 1;
 405                       }
 406
 407                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 408                     pstr->wcs[byte_idx] = wcu;
 409                     pstr->offsets[byte_idx] = src_idx;
 410                     for (i = 1; i < mbcdlen; ++i)
 411                       {
 412                         pstr->offsets[byte_idx + i]
 413                           = src_idx + (i < mbclen ? i : mbclen - 1);
 414                         pstr->wcs[byte_idx + i] = WEOF;
 415                       }
 416                     pstr->len += mbcdlen - mbclen;
 417                     if (pstr->raw_stop > src_idx)
 418                       pstr->stop += mbcdlen - mbclen;
 419                     end_idx = (pstr->bufs_len > pstr->len)
 420                               ? pstr->len : pstr->bufs_len;
 421                     byte_idx += mbcdlen;
 422                     src_idx += mbclen;
 423                     continue;
 424                   }
 425                 else
 426                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 427               }
 428             else
 429               memcpy (pstr->mbs + byte_idx, p, mbclen);
 430
 431             if (BE (pstr->offsets_needed != 0, 0))
 432               {
 433                 size_t i;
 434                 for (i = 0; i < mbclen; ++i)
 435                   pstr->offsets[byte_idx + i] = src_idx + i;
 436               }
 437             src_idx += mbclen;
 438
 439             pstr->wcs[byte_idx++] = wcu;
 440             /* Write paddings.  */
 441             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 442               pstr->wcs[byte_idx++] = WEOF;
 443           }
 444         else if (mbclen == (size_t) -1 || mbclen == 0)
 445           {
 446             /* It is an invalid character or '\0'.  Just use the byte.  */
 447             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 448
 449             if (BE (pstr->trans != NULL, 0))
 450               ch = pstr->trans [ch];
 451             pstr->mbs[byte_idx] = ch;
 452
 453             if (BE (pstr->offsets_needed != 0, 0))
 454               pstr->offsets[byte_idx] = src_idx;
 455             ++src_idx;
 456
 457             /* And also cast it to wide char.  */
 458             pstr->wcs[byte_idx++] = (wchar_t) ch;
 459             if (BE (mbclen == (size_t) -1, 0))
 460               pstr->cur_state = prev_st;
 461           }
 462         else
 463           {
 464             /* The buffer doesn't have enough space, finish to build.  */
 465             pstr->cur_state = prev_st;
 466             break;
 467           }
 468       }
 469   pstr->valid_len = byte_idx;
 470   pstr->valid_raw_len = src_idx;
 471   return REG_NOERROR;
 472 }
 473
 474 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 475    Return the index.  */
 476
 477 static Idx
 478 internal_function
 479 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 480 {
 481   mbstate_t prev_st;
 482   Idx rawbuf_idx;
 483   size_t mbclen;
 484   wchar_t wc = 0;
 485
 486   /* Skip the characters which are not necessary to check.  */
 487   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 488        rawbuf_idx < new_raw_idx;)
 489     {
 490       Idx remain_len;
 491       remain_len = pstr->len - rawbuf_idx;
 492       prev_st = pstr->cur_state;
 493       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
 494                         remain_len, &pstr->cur_state);
 495       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 496         {
 497           /* We treat these cases as a singlebyte character.  */
 498           mbclen = 1;
 499           pstr->cur_state = prev_st;
 500         }
 501       /* Then proceed the next character.  */
 502       rawbuf_idx += mbclen;
 503     }
 504   *last_wc = (wint_t) wc;
 505   return rawbuf_idx;
 506 }
 507 #endif /* RE_ENABLE_I18N  */
 508
 509 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 510    This function is used in case of REG_ICASE.  */
 511
 512 static void
 513 internal_function
 514 build_upper_buffer (re_string_t *pstr)
 515 {
 516   Idx char_idx, end_idx;
 517   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 518
 519   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 520     {
 521       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 522       if (BE (pstr->trans != NULL, 0))
 523         ch = pstr->trans[ch];
 524       if (islower (ch))
 525         pstr->mbs[char_idx] = toupper (ch);
 526       else
 527         pstr->mbs[char_idx] = ch;
 528     }
 529   pstr->valid_len = char_idx;
 530   pstr->valid_raw_len = char_idx;
 531 }
 532
 533 /* Apply TRANS to the buffer in PSTR.  */
 534
 535 static void
 536 internal_function
 537 re_string_translate_buffer (re_string_t *pstr)
 538 {
 539   Idx buf_idx, end_idx;
 540   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 541
 542   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 543     {
 544       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 545       pstr->mbs[buf_idx] = pstr->trans[ch];
 546     }
 547
 548   pstr->valid_len = buf_idx;
 549   pstr->valid_raw_len = buf_idx;
 550 }
 551
 552 /* This function re-construct the buffers.
 553    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 554    convert to upper case in case of REG_ICASE, apply translation.  */
 555
 556 static reg_errcode_t
 557 internal_function
 558 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 559 {
 560   regoff_t offset = (regoff_t) idx - (regoff_t) pstr->raw_mbs_idx;
 561   if (BE (offset < 0, 0))
 562     {
 563       /* Reset buffer.  */
 564 #ifdef RE_ENABLE_I18N
 565       if (pstr->mb_cur_max > 1)
 566         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 567 #endif /* RE_ENABLE_I18N */
 568       pstr->len = pstr->raw_len;
 569       pstr->stop = pstr->raw_stop;
 570       pstr->valid_len = 0;
 571       pstr->raw_mbs_idx = 0;
 572       pstr->valid_raw_len = 0;
 573       pstr->offsets_needed = 0;
 574       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 575                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 576       if (!pstr->mbs_allocated)
 577         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 578       offset = idx;
 579     }
 580
 581   if (BE (offset != 0, 1))
 582     {
 583       /* Are the characters which are already checked remain?  */
 584       if (BE (offset < pstr->valid_raw_len, 1)
 585 #ifdef RE_ENABLE_I18N
 586           /* Handling this would enlarge the code too much.
 587              Accept a slowdown in that case.  */
 588           && pstr->offsets_needed == 0
 589 #endif
 590          )
 591         {
 592           /* Yes, move them to the front of the buffer.  */
 593           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 594 #ifdef RE_ENABLE_I18N
 595           if (pstr->mb_cur_max > 1)
 596             memmove (pstr->wcs, pstr->wcs + offset,
 597                      (pstr->valid_len - offset) * sizeof (wint_t));
 598 #endif /* RE_ENABLE_I18N */
 599           if (BE (pstr->mbs_allocated, 0))
 600             memmove (pstr->mbs, pstr->mbs + offset,
 601                      pstr->valid_len - offset);
 602           pstr->valid_len -= offset;
 603           pstr->valid_raw_len -= offset;
 604 #if DEBUG
 605           assert (pstr->valid_len > 0);
 606 #endif
 607         }
 608       else
 609         {
 610           /* No, skip all characters until IDX.  */
 611 #ifdef RE_ENABLE_I18N
 612           if (BE (pstr->offsets_needed, 0))
 613             {
 614               pstr->len = pstr->raw_len - idx + offset;
 615               pstr->stop = pstr->raw_stop - idx + offset;
 616               pstr->offsets_needed = 0;
 617             }
 618 #endif
 619           pstr->valid_len = 0;
 620           pstr->valid_raw_len = 0;
 621 #ifdef RE_ENABLE_I18N
 622           if (pstr->mb_cur_max > 1)
 623             {
 624               Idx wcs_idx;
 625               wint_t wc = WEOF;
 626
 627               if (pstr->is_utf8)
 628                 {
 629                   const unsigned char *raw, *p, *q, *end;
 630
 631                   /* Special case UTF-8.  Multi-byte chars start with any
 632                      byte other than 0x80 - 0xbf.  */
 633                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 634                   end = raw + (offset - pstr->mb_cur_max);
 635                   for (p = raw + offset - 1; p >= end; --p)
 636                     if ((*p & 0xc0) != 0x80)
 637                       {
 638                         mbstate_t cur_state;
 639                         wchar_t wc2;
 640                         Idx mlen = raw + pstr->len - p;
 641                         unsigned char buf[6];
 642
 643                         q = p;
 644                         if (BE (pstr->trans != NULL, 0))
 645                           {
 646                             int i = mlen < 6 ? mlen : 6;
 647                             while (--i >= 0)
 648                               buf[i] = pstr->trans[p[i]];
 649                             q = buf;
 650                           }
 651                         /* XXX Don't use mbrtowc, we know which conversion
 652                            to use (UTF-8 -> UCS4).  */
 653                         memset (&cur_state, 0, sizeof (cur_state));
 654                         mlen = (mbrtowc (&wc2, (const char *) p, mlen,
 655                                          &cur_state)
 656                                 - (raw + offset - p));
 657                         if (mlen >= 0)
 658                           {
 659                             memset (&pstr->cur_state, '\0',
 660                                     sizeof (mbstate_t));
 661                             pstr->valid_len = mlen;
 662                             wc = wc2;
 663                           }
 664                         break;
 665                       }
 666                 }
 667
 668               if (wc == WEOF)
 669                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 670               if (BE (pstr->valid_len, 0))
 671                 {
 672                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 673                     pstr->wcs[wcs_idx] = WEOF;
 674                   if (pstr->mbs_allocated)
 675                     memset (pstr->mbs, 255, pstr->valid_len);
 676                 }
 677               pstr->valid_raw_len = pstr->valid_len;
 678               pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 679                                     && IS_WIDE_WORD_CHAR (wc))
 680                                    ? CONTEXT_WORD
 681                                    : ((IS_WIDE_NEWLINE (wc)
 682                                        && pstr->newline_anchor)
 683                                       ? CONTEXT_NEWLINE : 0));
 684             }
 685           else
 686 #endif /* RE_ENABLE_I18N */
 687             {
 688               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 689               if (pstr->trans)
 690                 c = pstr->trans[c];
 691               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 692                                    ? CONTEXT_WORD
 693                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 694                                       ? CONTEXT_NEWLINE : 0));
 695             }
 696         }
 697       if (!BE (pstr->mbs_allocated, 0))
 698         pstr->mbs += offset;
 699     }
 700   pstr->raw_mbs_idx = idx;
 701   pstr->len -= offset;
 702   pstr->stop -= offset;
 703
 704   /* Then build the buffers.  */
 705 #ifdef RE_ENABLE_I18N
 706   if (pstr->mb_cur_max > 1)
 707     {
 708       if (pstr->icase)
 709         {
 710           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 711           if (BE (ret != REG_NOERROR, 0))
 712             return ret;
 713         }
 714       else
 715         build_wcs_buffer (pstr);
 716     }
 717   else
 718 #endif /* RE_ENABLE_I18N */
 719   if (BE (pstr->mbs_allocated, 0))
 720     {
 721       if (pstr->icase)
 722         build_upper_buffer (pstr);
 723       else if (pstr->trans != NULL)
 724         re_string_translate_buffer (pstr);
 725     }
 726   else
 727     pstr->valid_len = pstr->len;
 728
 729   pstr->cur_idx = 0;
 730   return REG_NOERROR;
 731 }
 732
 733 static unsigned char
 734 internal_function __attribute ((pure))
 735 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 736 {
 737   int ch;
 738   Idx off;
 739
 740   /* Handle the common (easiest) cases first.  */
 741   if (BE (!pstr->mbs_allocated, 1))
 742     return re_string_peek_byte (pstr, idx);
 743
 744 #ifdef RE_ENABLE_I18N
 745   if (pstr->mb_cur_max > 1
 746       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 747     return re_string_peek_byte (pstr, idx);
 748 #endif
 749
 750   off = pstr->cur_idx + idx;
 751 #ifdef RE_ENABLE_I18N
 752   if (pstr->offsets_needed)
 753     off = pstr->offsets[off];
 754 #endif
 755
 756   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 757
 758 #ifdef RE_ENABLE_I18N
 759   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 760      this function returns CAPITAL LETTER I instead of first byte of
 761      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 762      since peek_byte_case doesn't advance cur_idx in any way.  */
 763   if (pstr->offsets_needed && !isascii (ch))
 764     return re_string_peek_byte (pstr, idx);
 765 #endif
 766
 767   return ch;
 768 }
 769
 770 static unsigned char
 771 internal_function __attribute ((pure))
 772 re_string_fetch_byte_case (re_string_t *pstr)
 773 {
 774   if (BE (!pstr->mbs_allocated, 1))
 775     return re_string_fetch_byte (pstr);
 776
 777 #ifdef RE_ENABLE_I18N
 778   if (pstr->offsets_needed)
 779     {
 780       Idx off;
 781       int ch;
 782
 783       /* For tr_TR.UTF-8 [[:islower:]] there is
 784          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 785          in that case the whole multi-byte character and return
 786          the original letter.  On the other side, with
 787          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 788          anything else would complicate things too much.  */
 789
 790       if (!re_string_first_byte (pstr, pstr->cur_idx))
 791         return re_string_fetch_byte (pstr);
 792
 793       off = pstr->offsets[pstr->cur_idx];
 794       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 795
 796       if (! isascii (ch))
 797         return re_string_fetch_byte (pstr);
 798
 799       re_string_skip_bytes (pstr,
 800                             re_string_char_size_at (pstr, pstr->cur_idx));
 801       return ch;
 802     }
 803 #endif
 804
 805   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 806 }
 807
 808 static void
 809 internal_function
 810 re_string_destruct (re_string_t *pstr)
 811 {
 812 #ifdef RE_ENABLE_I18N
 813   re_free (pstr->wcs);
 814   re_free (pstr->offsets);
 815 #endif /* RE_ENABLE_I18N  */
 816   if (pstr->mbs_allocated)
 817     re_free (pstr->mbs);
 818 }
 819
 820 /* Return the context at IDX in INPUT.  */
 821
 822 static unsigned int
 823 internal_function
 824 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 825 {
 826   int c;
 827   if (BE (! REG_VALID_INDEX (idx), 0))
 828     /* In this case, we use the value stored in input->tip_context,
 829        since we can't know the character in input->mbs[-1] here.  */
 830     return input->tip_context;
 831   if (BE (idx == input->len, 0))
 832     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 833             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 834 #ifdef RE_ENABLE_I18N
 835   if (input->mb_cur_max > 1)
 836     {
 837       wint_t wc;
 838       Idx wc_idx = idx;
 839       while(input->wcs[wc_idx] == WEOF)
 840         {
 841 #ifdef DEBUG
 842           /* It must not happen.  */
 843           assert (wc_idx >= 0);
 844 #endif
 845           --wc_idx;
 846           if (wc_idx < 0)
 847             return input->tip_context;
 848         }
 849       wc = input->wcs[wc_idx];
 850       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 851         return CONTEXT_WORD;
 852       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 853               ? CONTEXT_NEWLINE : 0);
 854     }
 855   else
 856 #endif
 857     {
 858       c = re_string_byte_at (input, idx);
 859       if (bitset_contain (input->word_char, c))
 860         return CONTEXT_WORD;
 861       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 862     }
 863 }
 864 \f
 865 /* Functions for set operation.  */
 866
 867 static reg_errcode_t
 868 internal_function
 869 re_node_set_alloc (re_node_set *set, Idx size)
 870 {
 871   set->alloc = size;
 872   set->nelem = 0;
 873   set->elems = re_malloc (Idx, size);
 874   if (BE (set->elems == NULL, 0))
 875     return REG_ESPACE;
 876   return REG_NOERROR;
 877 }
 878
 879 static reg_errcode_t
 880 internal_function
 881 re_node_set_init_1 (re_node_set *set, Idx elem)
 882 {
 883   set->alloc = 1;
 884   set->nelem = 1;
 885   set->elems = re_malloc (Idx, 1);
 886   if (BE (set->elems == NULL, 0))
 887     {
 888       set->alloc = set->nelem = 0;
 889       return REG_ESPACE;
 890     }
 891   set->elems[0] = elem;
 892   return REG_NOERROR;
 893 }
 894
 895 static reg_errcode_t
 896 internal_function
 897 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
 898 {
 899   set->alloc = 2;
 900   set->elems = re_malloc (Idx, 2);
 901   if (BE (set->elems == NULL, 0))
 902     return REG_ESPACE;
 903   if (elem1 == elem2)
 904     {
 905       set->nelem = 1;
 906       set->elems[0] = elem1;
 907     }
 908   else
 909     {
 910       set->nelem = 2;
 911       if (elem1 < elem2)
 912         {
 913           set->elems[0] = elem1;
 914           set->elems[1] = elem2;
 915         }
 916       else
 917         {
 918           set->elems[0] = elem2;
 919           set->elems[1] = elem1;
 920         }
 921     }
 922   return REG_NOERROR;
 923 }
 924
 925 static reg_errcode_t
 926 internal_function
 927 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
 928 {
 929   dest->nelem = src->nelem;
 930   if (src->nelem > 0)
 931     {
 932       dest->alloc = dest->nelem;
 933       dest->elems = re_malloc (Idx, dest->alloc);
 934       if (BE (dest->elems == NULL, 0))
 935         {
 936           dest->alloc = dest->nelem = 0;
 937           return REG_ESPACE;
 938         }
 939       memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
 940     }
 941   else
 942     re_node_set_init_empty (dest);
 943   return REG_NOERROR;
 944 }
 945
 946 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
 947    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
 948    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
 949
 950 static reg_errcode_t
 951 internal_function
 952 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
 953                            const re_node_set *src2)
 954 {
 955   Idx i1, i2, is, id, delta, sbase;
 956   if (src1->nelem == 0 || src2->nelem == 0)
 957     return REG_NOERROR;
 958
 959   /* We need dest->nelem + 2 * elems_in_intersection; this is a
 960      conservative estimate.  */
 961   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
 962     {
 963       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
 964       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
 965       if (BE (new_elems == NULL, 0))
 966         return REG_ESPACE;
 967       dest->elems = new_elems;
 968       dest->alloc = new_alloc;
 969     }
 970
 971   /* Find the items in the intersection of SRC1 and SRC2, and copy
 972      into the top of DEST those that are not already in DEST itself.  */
 973   sbase = dest->nelem + src1->nelem + src2->nelem;
 974   i1 = src1->nelem - 1;
 975   i2 = src2->nelem - 1;
 976   id = dest->nelem - 1;
 977   for (;;)
 978     {
 979       if (src1->elems[i1] == src2->elems[i2])
 980         {
 981           /* Try to find the item in DEST.  Maybe we could binary search?  */
 982           while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
 983             --id;
 984
 985           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
 986             dest->elems[--sbase] = src1->elems[i1];
 987
 988           if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
 989             break;
 990         }
 991
 992       /* Lower the highest of the two items.  */
 993       else if (src1->elems[i1] < src2->elems[i2])
 994         {
 995           if (! REG_VALID_INDEX (--i2))
 996             break;
 997         }
 998       else
 999         {
1000           if (! REG_VALID_INDEX (--i1))
1001             break;
1002         }
1003     }
1004
1005   id = dest->nelem - 1;
1006   is = dest->nelem + src1->nelem + src2->nelem - 1;
1007   delta = is - sbase + 1;
1008
1009   /* Now copy.  When DELTA becomes zero, the remaining
1010      DEST elements are already in place; this is more or
1011      less the same loop that is in re_node_set_merge.  */
1012   dest->nelem += delta;
1013   if (delta > 0 && REG_VALID_INDEX (id))
1014     for (;;)
1015       {
1016         if (dest->elems[is] > dest->elems[id])
1017           {
1018             /* Copy from the top.  */
1019             dest->elems[id + delta--] = dest->elems[is--];
1020             if (delta == 0)
1021               break;
1022           }
1023         else
1024           {
1025             /* Slide from the bottom.  */
1026             dest->elems[id + delta] = dest->elems[id];
1027             if (! REG_VALID_INDEX (--id))
1028               break;
1029           }
1030       }
1031
1032   /* Copy remaining SRC elements.  */
1033   memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]);
1034
1035   return REG_NOERROR;
1036 }
1037
1038 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1039    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1040
1041 static reg_errcode_t
1042 internal_function
1043 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1044                         const re_node_set *src2)
1045 {
1046   Idx i1, i2, id;
1047   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1048     {
1049       dest->alloc = src1->nelem + src2->nelem;
1050       dest->elems = re_malloc (Idx, dest->alloc);
1051       if (BE (dest->elems == NULL, 0))
1052         return REG_ESPACE;
1053     }
1054   else
1055     {
1056       if (src1 != NULL && src1->nelem > 0)
1057         return re_node_set_init_copy (dest, src1);
1058       else if (src2 != NULL && src2->nelem > 0)
1059         return re_node_set_init_copy (dest, src2);
1060       else
1061         re_node_set_init_empty (dest);
1062       return REG_NOERROR;
1063     }
1064   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1065     {
1066       if (src1->elems[i1] > src2->elems[i2])
1067         {
1068           dest->elems[id++] = src2->elems[i2++];
1069           continue;
1070         }
1071       if (src1->elems[i1] == src2->elems[i2])
1072         ++i2;
1073       dest->elems[id++] = src1->elems[i1++];
1074     }
1075   if (i1 < src1->nelem)
1076     {
1077       memcpy (dest->elems + id, src1->elems + i1,
1078              (src1->nelem - i1) * sizeof dest->elems[0]);
1079       id += src1->nelem - i1;
1080     }
1081   else if (i2 < src2->nelem)
1082     {
1083       memcpy (dest->elems + id, src2->elems + i2,
1084              (src2->nelem - i2) * sizeof dest->elems[0]);
1085       id += src2->nelem - i2;
1086     }
1087   dest->nelem = id;
1088   return REG_NOERROR;
1089 }
1090
1091 /* Calculate the union set of the sets DEST and SRC. And store it to
1092    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1093
1094 static reg_errcode_t
1095 internal_function
1096 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1097 {
1098   Idx is, id, sbase, delta;
1099   if (src == NULL || src->nelem == 0)
1100     return REG_NOERROR;
1101   if (dest->alloc < 2 * src->nelem + dest->nelem)
1102     {
1103       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1104       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1105       if (BE (new_buffer == NULL, 0))
1106         return REG_ESPACE;
1107       dest->elems = new_buffer;
1108       dest->alloc = new_alloc;
1109     }
1110
1111   if (BE (dest->nelem == 0, 0))
1112     {
1113       dest->nelem = src->nelem;
1114       memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
1115       return REG_NOERROR;
1116     }
1117
1118   /* Copy into the top of DEST the items of SRC that are not
1119      found in DEST.  Maybe we could binary search in DEST?  */
1120   for (sbase = dest->nelem + 2 * src->nelem,
1121        is = src->nelem - 1, id = dest->nelem - 1;
1122        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1123     {
1124       if (dest->elems[id] == src->elems[is])
1125         is--, id--;
1126       else if (dest->elems[id] < src->elems[is])
1127         dest->elems[--sbase] = src->elems[is--];
1128       else /* if (dest->elems[id] > src->elems[is]) */
1129         --id;
1130     }
1131
1132   if (REG_VALID_INDEX (is))
1133     {
1134       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1135       sbase -= is + 1;
1136       memcpy (dest->elems + sbase, src->elems,
1137               (is + 1) * sizeof dest->elems[0]);
1138     }
1139
1140   id = dest->nelem - 1;
1141   is = dest->nelem + 2 * src->nelem - 1;
1142   delta = is - sbase + 1;
1143   if (delta == 0)
1144     return REG_NOERROR;
1145
1146   /* Now copy.  When DELTA becomes zero, the remaining
1147      DEST elements are already in place.  */
1148   dest->nelem += delta;
1149   for (;;)
1150     {
1151       if (dest->elems[is] > dest->elems[id])
1152         {
1153           /* Copy from the top.  */
1154           dest->elems[id + delta--] = dest->elems[is--];
1155           if (delta == 0)
1156             break;
1157         }
1158       else
1159         {
1160           /* Slide from the bottom.  */
1161           dest->elems[id + delta] = dest->elems[id];
1162           if (! REG_VALID_INDEX (--id))
1163             {
1164               /* Copy remaining SRC elements.  */
1165               memcpy (dest->elems, dest->elems + sbase,
1166                       delta * sizeof dest->elems[0]);
1167               break;
1168             }
1169         }
1170     }
1171
1172   return REG_NOERROR;
1173 }
1174
1175 /* Insert the new element ELEM to the re_node_set* SET.
1176    SET should not already have ELEM.
1177    return -1 if an error is occured, return 1 otherwise.  */
1178
1179 static int
1180 internal_function
1181 re_node_set_insert (re_node_set *set, Idx elem)
1182 {
1183   Idx idx;
1184   /* In case the set is empty.  */
1185   if (set->alloc == 0)
1186     {
1187       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1188         return 1;
1189       else
1190         return -1;
1191     }
1192
1193   if (BE (set->nelem, 0) == 0)
1194     {
1195       /* We already guaranteed above that set->alloc != 0.  */
1196       set->elems[0] = elem;
1197       ++set->nelem;
1198       return 1;
1199     }
1200
1201   /* Realloc if we need.  */
1202   if (set->alloc == set->nelem)
1203     {
1204       Idx *new_elems;
1205       set->alloc = set->alloc * 2;
1206       new_elems = re_realloc (set->elems, Idx, set->alloc);
1207       if (BE (new_elems == NULL, 0))
1208         return -1;
1209       set->elems = new_elems;
1210     }
1211
1212   /* Move the elements which follows the new element.  Test the
1213      first element separately to skip a check in the inner loop.  */
1214   if (elem < set->elems[0])
1215     {
1216       idx = 0;
1217       for (idx = set->nelem; idx > 0; idx--)
1218         set->elems[idx] = set->elems[idx - 1];
1219     }
1220   else
1221     {
1222       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1223         set->elems[idx] = set->elems[idx - 1];
1224     }
1225
1226   /* Insert the new element.  */
1227   set->elems[idx] = elem;
1228   ++set->nelem;
1229   return 1;
1230 }
1231
1232 /* Insert the new element ELEM to the re_node_set* SET.
1233    SET should not already have any element greater than or equal to ELEM.
1234    Return -1 if an error is occured, return 1 otherwise.  */
1235
1236 static int
1237 internal_function
1238 re_node_set_insert_last (re_node_set *set, Idx elem)
1239 {
1240   /* Realloc if we need.  */
1241   if (set->alloc == set->nelem)
1242     {
1243       Idx *new_elems;
1244       set->alloc = (set->alloc + 1) * 2;
1245       new_elems = re_realloc (set->elems, Idx, set->alloc);
1246       if (BE (new_elems == NULL, 0))
1247         return -1;
1248       set->elems = new_elems;
1249     }
1250
1251   /* Insert the new element.  */
1252   set->elems[set->nelem++] = elem;
1253   return 1;
1254 }
1255
1256 /* Compare two node sets SET1 and SET2.
1257    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1258
1259 static int
1260 internal_function __attribute ((pure))
1261 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1262 {
1263   Idx i;
1264   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1265     return 0;
1266   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1267     if (set1->elems[i] != set2->elems[i])
1268       return 0;
1269   return 1;
1270 }
1271
1272 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1273
1274 static Idx
1275 internal_function __attribute ((pure))
1276 re_node_set_contains (const re_node_set *set, Idx elem)
1277 {
1278   __re_size_t idx, right, mid;
1279   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1280     return 0;
1281
1282   /* Binary search the element.  */
1283   idx = 0;
1284   right = set->nelem - 1;
1285   while (idx < right)
1286     {
1287       mid = (idx + right) / 2;
1288       if (set->elems[mid] < elem)
1289         idx = mid + 1;
1290       else
1291         right = mid;
1292     }
1293   return set->elems[idx] == elem ? idx + 1 : 0;
1294 }
1295
1296 static void
1297 internal_function
1298 re_node_set_remove_at (re_node_set *set, Idx idx)
1299 {
1300   if (idx < 0 || idx >= set->nelem)
1301     return;
1302   --set->nelem;
1303   for (; idx < set->nelem; idx++)
1304     set->elems[idx] = set->elems[idx + 1];
1305 }
1306 \f
1307
1308 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1309    Or return REG_MISSING if an error occurred.  */
1310
1311 static Idx
1312 internal_function
1313 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1314 {
1315   int type = token.type;
1316   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1317     {
1318       Idx new_nodes_alloc = dfa->nodes_alloc * 2;
1319       Idx *new_nexts, *new_indices;
1320       re_node_set *new_edests, *new_eclosures;
1321
1322       re_token_t *new_nodes = re_realloc (dfa->nodes, re_token_t,
1323                                           new_nodes_alloc);
1324       if (BE (new_nodes == NULL, 0))
1325         return REG_MISSING;
1326       dfa->nodes = new_nodes;
1327       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1328       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1329       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1330       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1331       if (BE (new_nexts == NULL || new_indices == NULL
1332               || new_edests == NULL || new_eclosures == NULL, 0))
1333         return REG_MISSING;
1334       dfa->nexts = new_nexts;
1335       dfa->org_indices = new_indices;
1336       dfa->edests = new_edests;
1337       dfa->eclosures = new_eclosures;
1338       dfa->nodes_alloc = new_nodes_alloc;
1339     }
1340   dfa->nodes[dfa->nodes_len] = token;
1341   dfa->nodes[dfa->nodes_len].constraint = 0;
1342 #ifdef RE_ENABLE_I18N
1343   dfa->nodes[dfa->nodes_len].accept_mb =
1344     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1345 #endif
1346   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1347   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1348   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1349   return dfa->nodes_len++;
1350 }
1351
1352 static inline re_hashval_t
1353 internal_function
1354 calc_state_hash (const re_node_set *nodes, unsigned int context)
1355 {
1356   re_hashval_t hash = nodes->nelem + context;
1357   Idx i;
1358   for (i = 0 ; i < nodes->nelem ; i++)
1359     hash += nodes->elems[i];
1360   return hash;
1361 }
1362
1363 /* Search for the state whose node_set is equivalent to NODES.
1364    Return the pointer to the state, if we found it in the DFA.
1365    Otherwise create the new one and return it.  In case of an error
1366    return NULL and set the error code in ERR.
1367    Note: - We assume NULL as the invalid state, then it is possible that
1368            return value is NULL and ERR is REG_NOERROR.
1369          - We never return non-NULL value in case of any errors, it is for
1370            optimization.  */
1371
1372 static re_dfastate_t*
1373 internal_function
1374 re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes)
1375 {
1376   re_hashval_t hash;
1377   re_dfastate_t *new_state;
1378   struct re_state_table_entry *spot;
1379   Idx i;
1380 #ifdef lint
1381   /* Suppress bogus uninitialized-variable warnings.  */
1382   *err = REG_NOERROR;
1383 #endif
1384   if (BE (nodes->nelem == 0, 0))
1385     {
1386       *err = REG_NOERROR;
1387       return NULL;
1388     }
1389   hash = calc_state_hash (nodes, 0);
1390   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1391
1392   for (i = 0 ; i < spot->num ; i++)
1393     {
1394       re_dfastate_t *state = spot->array[i];
1395       if (hash != state->hash)
1396         continue;
1397       if (re_node_set_compare (&state->nodes, nodes))
1398         return state;
1399     }
1400
1401   /* There are no appropriate state in the dfa, create the new one.  */
1402   new_state = create_ci_newstate (dfa, nodes, hash);
1403   if (BE (new_state != NULL, 1))
1404     return new_state;
1405   else
1406     {
1407       *err = REG_ESPACE;
1408       return NULL;
1409     }
1410 }
1411
1412 /* Search for the state whose node_set is equivalent to NODES and
1413    whose context is equivalent to CONTEXT.
1414    Return the pointer to the state, if we found it in the DFA.
1415    Otherwise create the new one and return it.  In case of an error
1416    return NULL and set the error code in ERR.
1417    Note: - We assume NULL as the invalid state, then it is possible that
1418            return value is NULL and ERR is REG_NOERROR.
1419          - We never return non-NULL value in case of any errors, it is for
1420            optimization.  */
1421
1422 static re_dfastate_t*
1423 internal_function
1424 re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa,
1425                           const re_node_set *nodes, unsigned int context)
1426 {
1427   re_hashval_t hash;
1428   re_dfastate_t *new_state;
1429   struct re_state_table_entry *spot;
1430   Idx i;
1431 #ifdef lint
1432   /* Suppress bogus uninitialized-variable warnings.  */
1433   *err = REG_NOERROR;
1434 #endif
1435   if (nodes->nelem == 0)
1436     {
1437       *err = REG_NOERROR;
1438       return NULL;
1439     }
1440   hash = calc_state_hash (nodes, context);
1441   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1442
1443   for (i = 0 ; i < spot->num ; i++)
1444     {
1445       re_dfastate_t *state = spot->array[i];
1446       if (state->hash == hash
1447           && state->context == context
1448           && re_node_set_compare (state->entrance_nodes, nodes))
1449         return state;
1450     }
1451   /* There are no appropriate state in `dfa', create the new one.  */
1452   new_state = create_cd_newstate (dfa, nodes, context, hash);
1453   if (BE (new_state != NULL, 1))
1454     return new_state;
1455   else
1456     {
1457       *err = REG_ESPACE;
1458       return NULL;
1459     }
1460 }
1461
1462 /* Finish initialization of the new state NEWSTATE, and using its hash value
1463    HASH put in the appropriate bucket of DFA's state table.  Return value
1464    indicates the error code if failed.  */
1465
1466 static reg_errcode_t
1467 internal_function
1468 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash)
1469 {
1470   struct re_state_table_entry *spot;
1471   reg_errcode_t err;
1472   Idx i;
1473
1474   newstate->hash = hash;
1475   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1476   if (BE (err != REG_NOERROR, 0))
1477     return REG_ESPACE;
1478   for (i = 0; i < newstate->nodes.nelem; i++)
1479     {
1480       Idx elem = newstate->nodes.elems[i];
1481       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1482         re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1483     }
1484
1485   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1486   if (BE (spot->alloc <= spot->num, 0))
1487     {
1488       Idx new_alloc = 2 * spot->num + 2;
1489       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1490                                               new_alloc);
1491       if (BE (new_array == NULL, 0))
1492         return REG_ESPACE;
1493       spot->array = new_array;
1494       spot->alloc = new_alloc;
1495     }
1496   spot->array[spot->num++] = newstate;
1497   return REG_NOERROR;
1498 }
1499
1500 /* Create the new state which is independ of contexts.
1501    Return the new state if succeeded, otherwise return NULL.  */
1502
1503 static re_dfastate_t *
1504 internal_function
1505 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1506                     re_hashval_t hash)
1507 {
1508   Idx i;
1509   reg_errcode_t err;
1510   re_dfastate_t *newstate;
1511
1512   newstate = re_calloc (re_dfastate_t, 1);
1513   if (BE (newstate == NULL, 0))
1514     return NULL;
1515   err = re_node_set_init_copy (&newstate->nodes, nodes);
1516   if (BE (err != REG_NOERROR, 0))
1517     {
1518       re_free (newstate);
1519       return NULL;
1520     }
1521
1522   newstate->entrance_nodes = &newstate->nodes;
1523   for (i = 0 ; i < nodes->nelem ; i++)
1524     {
1525       re_token_t *node = dfa->nodes + nodes->elems[i];
1526       re_token_type_t type = node->type;
1527       if (type == CHARACTER && !node->constraint)
1528         continue;
1529 #ifdef RE_ENABLE_I18N
1530       newstate->accept_mb |= node->accept_mb;
1531 #endif /* RE_ENABLE_I18N */
1532
1533       /* If the state has the halt node, the state is a halt state.  */
1534       if (type == END_OF_RE)
1535         newstate->halt = 1;
1536       else if (type == OP_BACK_REF)
1537         newstate->has_backref = 1;
1538       else if (type == ANCHOR || node->constraint)
1539         newstate->has_constraint = 1;
1540     }
1541   err = register_state (dfa, newstate, hash);
1542   if (BE (err != REG_NOERROR, 0))
1543     {
1544       free_state (newstate);
1545       newstate = NULL;
1546     }
1547   return newstate;
1548 }
1549
1550 /* Create the new state which is depend on the context CONTEXT.
1551    Return the new state if succeeded, otherwise return NULL.  */
1552
1553 static re_dfastate_t *
1554 internal_function
1555 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1556                     unsigned int context, re_hashval_t hash)
1557 {
1558   Idx i, nctx_nodes = 0;
1559   reg_errcode_t err;
1560   re_dfastate_t *newstate;
1561
1562   newstate = re_calloc (re_dfastate_t, 1);
1563   if (BE (newstate == NULL, 0))
1564     return NULL;
1565   err = re_node_set_init_copy (&newstate->nodes, nodes);
1566   if (BE (err != REG_NOERROR, 0))
1567     {
1568       re_free (newstate);
1569       return NULL;
1570     }
1571
1572   newstate->context = context;
1573   newstate->entrance_nodes = &newstate->nodes;
1574
1575   for (i = 0 ; i < nodes->nelem ; i++)
1576     {
1577       unsigned int constraint = 0;
1578       re_token_t *node = dfa->nodes + nodes->elems[i];
1579       re_token_type_t type = node->type;
1580       if (node->constraint)
1581         constraint = node->constraint;
1582
1583       if (type == CHARACTER && !constraint)
1584         continue;
1585 #ifdef RE_ENABLE_I18N
1586       newstate->accept_mb |= node->accept_mb;
1587 #endif /* RE_ENABLE_I18N */
1588
1589       /* If the state has the halt node, the state is a halt state.  */
1590       if (type == END_OF_RE)
1591         newstate->halt = 1;
1592       else if (type == OP_BACK_REF)
1593         newstate->has_backref = 1;
1594       else if (type == ANCHOR)
1595         constraint = node->opr.ctx_type;
1596
1597       if (constraint)
1598         {
1599           if (newstate->entrance_nodes == &newstate->nodes)
1600             {
1601               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1602               if (BE (newstate->entrance_nodes == NULL, 0))
1603                 {
1604                   free_state (newstate);
1605                   return NULL;
1606                 }
1607               re_node_set_init_copy (newstate->entrance_nodes, nodes);
1608               nctx_nodes = 0;
1609               newstate->has_constraint = 1;
1610             }
1611
1612           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1613             {
1614               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1615               ++nctx_nodes;
1616             }
1617         }
1618     }
1619   err = register_state (dfa, newstate, hash);
1620   if (BE (err != REG_NOERROR, 0))
1621     {
1622       free_state (newstate);
1623       newstate = NULL;
1624     }
1625   return  newstate;
1626 }
1627
1628 static void
1629 internal_function
1630 free_state (re_dfastate_t *state)
1631 {
1632   re_node_set_free (&state->non_eps_nodes);
1633   re_node_set_free (&state->inveclosure);
1634   if (state->entrance_nodes != &state->nodes)
1635     {
1636       re_node_set_free (state->entrance_nodes);
1637       re_free (state->entrance_nodes);
1638     }
1639   re_node_set_free (&state->nodes);
1640   re_free (state->word_trtable);
1641   re_free (state->trtable);
1642   re_free (state);
1643 }