1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 # if defined __GNUC__ && defined __GNUC_MINOR__
22 # define __GNUC_PREREQ(maj, min) \
23 ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
25 # define __GNUC_PREREQ(maj, min) 0
29 #if !__GNUC_PREREQ (3, 0)
33 static void re_string_construct_common (const char *str, Idx len,
35 REG_TRANSLATE_TYPE trans, bool icase,
36 const re_dfa_t *dfa) internal_function;
37 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
38 const re_node_set *nodes,
39 re_hashval_t hash) internal_function;
40 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
41 const re_node_set *nodes,
43 re_hashval_t hash) internal_function;
45 /* Functions for string operation. */
47 /* This function allocate the buffers. It is necessary to call
48 re_string_reconstruct before using the object. */
52 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
53 REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
58 /* Ensure at least one character fits into the buffers. */
59 if (init_len < dfa->mb_cur_max)
60 init_len = dfa->mb_cur_max;
61 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
62 re_string_construct_common (str, len, pstr, trans, icase, dfa);
64 ret = re_string_realloc_buffers (pstr, init_buf_len);
65 if (BE (ret != REG_NOERROR, 0))
68 pstr->word_char = dfa->word_char;
69 pstr->word_ops_used = dfa->word_ops_used;
70 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
71 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
72 pstr->valid_raw_len = pstr->valid_len;
76 /* This function allocate the buffers, and initialize them. */
80 re_string_construct (re_string_t *pstr, const char *str, Idx len,
81 REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
84 memset (pstr, '\0', sizeof (re_string_t));
85 re_string_construct_common (str, len, pstr, trans, icase, dfa);
89 ret = re_string_realloc_buffers (pstr, len + 1);
90 if (BE (ret != REG_NOERROR, 0))
93 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
98 if (dfa->mb_cur_max > 1)
102 ret = build_wcs_upper_buffer (pstr);
103 if (BE (ret != REG_NOERROR, 0))
105 if (pstr->valid_raw_len >= len)
107 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
109 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
110 if (BE (ret != REG_NOERROR, 0))
115 #endif /* RE_ENABLE_I18N */
116 build_upper_buffer (pstr);
120 #ifdef RE_ENABLE_I18N
121 if (dfa->mb_cur_max > 1)
122 build_wcs_buffer (pstr);
124 #endif /* RE_ENABLE_I18N */
127 re_string_translate_buffer (pstr);
130 pstr->valid_len = pstr->bufs_len;
131 pstr->valid_raw_len = pstr->bufs_len;
139 /* Helper functions for re_string_allocate, and re_string_construct. */
143 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
145 #ifdef RE_ENABLE_I18N
146 if (pstr->mb_cur_max > 1)
148 wint_t *new_wcs = re_xrealloc (pstr->wcs, wint_t, new_buf_len);
149 if (BE (new_wcs == NULL, 0))
152 if (pstr->offsets != NULL)
154 Idx *new_offsets = re_xrealloc (pstr->offsets, Idx, new_buf_len);
155 if (BE (new_offsets == NULL, 0))
157 pstr->offsets = new_offsets;
160 #endif /* RE_ENABLE_I18N */
161 if (pstr->mbs_allocated)
163 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
165 if (BE (new_mbs == NULL, 0))
169 pstr->bufs_len = new_buf_len;
176 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
177 REG_TRANSLATE_TYPE trans, bool icase,
180 pstr->raw_mbs = (const unsigned char *) str;
183 pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans;
185 pstr->mbs_allocated = (trans != NULL || icase);
186 pstr->mb_cur_max = dfa->mb_cur_max;
187 pstr->is_utf8 = dfa->is_utf8;
188 pstr->map_notascii = dfa->map_notascii;
189 pstr->stop = pstr->len;
190 pstr->raw_stop = pstr->stop;
193 #ifdef RE_ENABLE_I18N
195 /* Build wide character buffer PSTR->WCS.
196 If the byte sequence of the string are:
197 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
198 Then wide character buffer will be:
199 <wc1> , WEOF , <wc2> , WEOF , <wc3>
200 We use WEOF for padding, they indicate that the position isn't
201 a first byte of a multibyte character.
203 Note that this function assumes PSTR->VALID_LEN elements are already
204 built and starts from PSTR->VALID_LEN. */
208 build_wcs_buffer (re_string_t *pstr)
211 unsigned char buf[MB_LEN_MAX];
212 assert (MB_LEN_MAX >= pstr->mb_cur_max);
214 unsigned char buf[64];
217 Idx byte_idx, end_idx, remain_len;
220 /* Build the buffers from pstr->valid_len to either pstr->len or
222 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
223 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
228 remain_len = end_idx - byte_idx;
229 prev_st = pstr->cur_state;
230 /* Apply the translation if we need. */
231 if (BE (pstr->trans != NULL, 0))
235 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
237 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
238 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
240 p = (const char *) buf;
243 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
244 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
245 if (BE (mbclen == (size_t) -2, 0))
247 /* The buffer doesn't have enough space, finish to build. */
248 pstr->cur_state = prev_st;
251 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
253 /* We treat these cases as a singlebyte character. */
255 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
256 if (BE (pstr->trans != NULL, 0))
257 wc = pstr->trans[wc];
258 pstr->cur_state = prev_st;
261 /* Write wide character and padding. */
262 pstr->wcs[byte_idx++] = wc;
263 /* Write paddings. */
264 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
265 pstr->wcs[byte_idx++] = WEOF;
267 pstr->valid_len = byte_idx;
268 pstr->valid_raw_len = byte_idx;
271 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
272 but for REG_ICASE. */
276 build_wcs_upper_buffer (re_string_t *pstr)
279 Idx src_idx, byte_idx, end_idx, remain_len;
282 char buf[MB_LEN_MAX];
283 assert (MB_LEN_MAX >= pstr->mb_cur_max);
288 byte_idx = pstr->valid_len;
289 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
291 /* The following optimization assumes that ASCII characters can be
292 mapped to wide characters with a simple cast. */
293 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
295 while (byte_idx < end_idx)
299 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
300 && mbsinit (&pstr->cur_state))
302 /* In case of a singlebyte character. */
304 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
305 /* The next step uses the assumption that wchar_t is encoded
306 ASCII-safe: all ASCII values can be converted like this. */
307 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
312 remain_len = end_idx - byte_idx;
313 prev_st = pstr->cur_state;
314 mbclen = mbrtowc (&wc,
315 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
316 + byte_idx), remain_len, &pstr->cur_state);
317 if (BE ((size_t) (mbclen + 2) > 2, 1))
325 mbcdlen = wcrtomb (buf, wcu, &prev_st);
326 if (BE (mbclen == mbcdlen, 1))
327 memcpy (pstr->mbs + byte_idx, buf, mbclen);
335 memcpy (pstr->mbs + byte_idx,
336 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
337 pstr->wcs[byte_idx++] = wcu;
338 /* Write paddings. */
339 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
340 pstr->wcs[byte_idx++] = WEOF;
342 else if (mbclen == (size_t) -1 || mbclen == 0)
344 /* It is an invalid character or '\0'. Just use the byte. */
345 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
346 pstr->mbs[byte_idx] = ch;
347 /* And also cast it to wide char. */
348 pstr->wcs[byte_idx++] = (wchar_t) ch;
349 if (BE (mbclen == (size_t) -1, 0))
350 pstr->cur_state = prev_st;
354 /* The buffer doesn't have enough space, finish to build. */
355 pstr->cur_state = prev_st;
359 pstr->valid_len = byte_idx;
360 pstr->valid_raw_len = byte_idx;
364 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
369 remain_len = end_idx - byte_idx;
370 prev_st = pstr->cur_state;
371 if (BE (pstr->trans != NULL, 0))
375 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
377 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
378 buf[i] = pstr->trans[ch];
380 p = (const char *) buf;
383 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
384 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
385 if (BE ((size_t) (mbclen + 2) > 2, 1))
393 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
394 if (BE (mbclen == mbcdlen, 1))
395 memcpy (pstr->mbs + byte_idx, buf, mbclen);
396 else if (mbcdlen != (size_t) -1)
400 if (byte_idx + mbcdlen > pstr->bufs_len)
402 pstr->cur_state = prev_st;
406 if (pstr->offsets == NULL)
408 pstr->offsets = re_xmalloc (Idx, pstr->bufs_len);
410 if (pstr->offsets == NULL)
413 if (!pstr->offsets_needed)
415 for (i = 0; i < (size_t) byte_idx; ++i)
416 pstr->offsets[i] = i;
417 pstr->offsets_needed = 1;
420 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
421 pstr->wcs[byte_idx] = wcu;
422 pstr->offsets[byte_idx] = src_idx;
423 for (i = 1; i < mbcdlen; ++i)
425 pstr->offsets[byte_idx + i]
426 = src_idx + (i < mbclen ? i : mbclen - 1);
427 pstr->wcs[byte_idx + i] = WEOF;
429 pstr->len += mbcdlen - mbclen;
430 if (pstr->raw_stop > src_idx)
431 pstr->stop += mbcdlen - mbclen;
432 end_idx = (pstr->bufs_len > pstr->len)
433 ? pstr->len : pstr->bufs_len;
439 memcpy (pstr->mbs + byte_idx, p, mbclen);
442 memcpy (pstr->mbs + byte_idx, p, mbclen);
444 if (BE (pstr->offsets_needed != 0, 0))
447 for (i = 0; i < mbclen; ++i)
448 pstr->offsets[byte_idx + i] = src_idx + i;
452 pstr->wcs[byte_idx++] = wcu;
453 /* Write paddings. */
454 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
455 pstr->wcs[byte_idx++] = WEOF;
457 else if (mbclen == (size_t) -1 || mbclen == 0)
459 /* It is an invalid character or '\0'. Just use the byte. */
460 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
462 if (BE (pstr->trans != NULL, 0))
463 ch = pstr->trans [ch];
464 pstr->mbs[byte_idx] = ch;
466 if (BE (pstr->offsets_needed != 0, 0))
467 pstr->offsets[byte_idx] = src_idx;
470 /* And also cast it to wide char. */
471 pstr->wcs[byte_idx++] = (wchar_t) ch;
472 if (BE (mbclen == (size_t) -1, 0))
473 pstr->cur_state = prev_st;
477 /* The buffer doesn't have enough space, finish to build. */
478 pstr->cur_state = prev_st;
482 pstr->valid_len = byte_idx;
483 pstr->valid_raw_len = src_idx;
487 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
492 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
499 /* Skip the characters which are not necessary to check. */
500 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
501 rawbuf_idx < new_raw_idx;)
504 remain_len = pstr->len - rawbuf_idx;
505 prev_st = pstr->cur_state;
506 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
507 remain_len, &pstr->cur_state);
508 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
510 /* We treat these cases as a singlebyte character. */
512 pstr->cur_state = prev_st;
514 /* Then proceed the next character. */
515 rawbuf_idx += mbclen;
517 *last_wc = (wint_t) wc;
520 #endif /* RE_ENABLE_I18N */
522 /* Build the buffer PSTR->MBS, and apply the translation if we need.
523 This function is used in case of REG_ICASE. */
527 build_upper_buffer (re_string_t *pstr)
529 Idx char_idx, end_idx;
530 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
532 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
534 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
535 if (BE (pstr->trans != NULL, 0))
536 ch = pstr->trans[ch];
538 pstr->mbs[char_idx] = toupper (ch);
540 pstr->mbs[char_idx] = ch;
542 pstr->valid_len = char_idx;
543 pstr->valid_raw_len = char_idx;
546 /* Apply TRANS to the buffer in PSTR. */
550 re_string_translate_buffer (re_string_t *pstr)
552 Idx buf_idx, end_idx;
553 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
555 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
557 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
558 pstr->mbs[buf_idx] = pstr->trans[ch];
561 pstr->valid_len = buf_idx;
562 pstr->valid_raw_len = buf_idx;
565 /* This function re-construct the buffers.
566 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
567 convert to upper case in case of REG_ICASE, apply translation. */
571 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
575 if (BE (pstr->raw_mbs_idx <= idx, 0))
576 offset = idx - pstr->raw_mbs_idx;
580 #ifdef RE_ENABLE_I18N
581 if (pstr->mb_cur_max > 1)
582 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
583 #endif /* RE_ENABLE_I18N */
584 pstr->len = pstr->raw_len;
585 pstr->stop = pstr->raw_stop;
587 pstr->raw_mbs_idx = 0;
588 pstr->valid_raw_len = 0;
589 pstr->offsets_needed = 0;
590 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
591 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
592 if (!pstr->mbs_allocated)
593 pstr->mbs = (unsigned char *) pstr->raw_mbs;
597 if (BE (offset != 0, 1))
599 /* Are the characters which are already checked remain? */
600 if (BE (offset < pstr->valid_raw_len, 1)
601 #ifdef RE_ENABLE_I18N
602 /* Handling this would enlarge the code too much.
603 Accept a slowdown in that case. */
604 && pstr->offsets_needed == 0
608 /* Yes, move them to the front of the buffer. */
609 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
610 #ifdef RE_ENABLE_I18N
611 if (pstr->mb_cur_max > 1)
612 memmove (pstr->wcs, pstr->wcs + offset,
613 (pstr->valid_len - offset) * sizeof (wint_t));
614 #endif /* RE_ENABLE_I18N */
615 if (BE (pstr->mbs_allocated, 0))
616 memmove (pstr->mbs, pstr->mbs + offset,
617 pstr->valid_len - offset);
618 pstr->valid_len -= offset;
619 pstr->valid_raw_len -= offset;
621 assert (pstr->valid_len > 0);
626 /* No, skip all characters until IDX. */
627 #ifdef RE_ENABLE_I18N
628 if (BE (pstr->offsets_needed, 0))
630 pstr->len = pstr->raw_len - idx + offset;
631 pstr->stop = pstr->raw_stop - idx + offset;
632 pstr->offsets_needed = 0;
636 pstr->valid_raw_len = 0;
637 #ifdef RE_ENABLE_I18N
638 if (pstr->mb_cur_max > 1)
645 const unsigned char *raw, *p, *q, *end;
647 /* Special case UTF-8. Multi-byte chars start with any
648 byte other than 0x80 - 0xbf. */
649 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
650 end = raw + (offset - pstr->mb_cur_max);
651 for (p = raw + offset - 1; p >= end; --p)
652 if ((*p & 0xc0) != 0x80)
656 Idx mlen = raw + pstr->len - p;
657 unsigned char buf[6];
661 if (BE (pstr->trans != NULL, 0))
663 int i = mlen < 6 ? mlen : 6;
665 buf[i] = pstr->trans[p[i]];
668 /* XXX Don't use mbrtowc, we know which conversion
669 to use (UTF-8 -> UCS4). */
670 memset (&cur_state, 0, sizeof (cur_state));
671 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
673 if (raw + offset - p <= mbclen && mbclen < (size_t) -2)
675 memset (&pstr->cur_state, '\0',
677 pstr->valid_len = mbclen - (raw + offset - p);
685 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
686 if (BE (pstr->valid_len, 0))
688 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
689 pstr->wcs[wcs_idx] = WEOF;
690 if (pstr->mbs_allocated)
691 memset (pstr->mbs, -1, pstr->valid_len);
693 pstr->valid_raw_len = pstr->valid_len;
694 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
695 && IS_WIDE_WORD_CHAR (wc))
697 : ((IS_WIDE_NEWLINE (wc)
698 && pstr->newline_anchor)
699 ? CONTEXT_NEWLINE : 0));
702 #endif /* RE_ENABLE_I18N */
704 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
707 pstr->tip_context = (bitset_contain (pstr->word_char, c)
709 : ((IS_NEWLINE (c) && pstr->newline_anchor)
710 ? CONTEXT_NEWLINE : 0));
713 if (!BE (pstr->mbs_allocated, 0))
716 pstr->raw_mbs_idx = idx;
718 pstr->stop -= offset;
720 /* Then build the buffers. */
721 #ifdef RE_ENABLE_I18N
722 if (pstr->mb_cur_max > 1)
726 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
727 if (BE (ret != REG_NOERROR, 0))
731 build_wcs_buffer (pstr);
734 #endif /* RE_ENABLE_I18N */
735 if (BE (pstr->mbs_allocated, 0))
738 build_upper_buffer (pstr);
739 else if (pstr->trans != NULL)
740 re_string_translate_buffer (pstr);
743 pstr->valid_len = pstr->len;
750 internal_function __attribute ((pure))
751 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
756 /* Handle the common (easiest) cases first. */
757 if (BE (!pstr->mbs_allocated, 1))
758 return re_string_peek_byte (pstr, idx);
760 #ifdef RE_ENABLE_I18N
761 if (pstr->mb_cur_max > 1
762 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
763 return re_string_peek_byte (pstr, idx);
766 off = pstr->cur_idx + idx;
767 #ifdef RE_ENABLE_I18N
768 if (pstr->offsets_needed)
769 off = pstr->offsets[off];
772 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
774 #ifdef RE_ENABLE_I18N
775 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
776 this function returns CAPITAL LETTER I instead of first byte of
777 DOTLESS SMALL LETTER I. The latter would confuse the parser,
778 since peek_byte_case doesn't advance cur_idx in any way. */
779 if (pstr->offsets_needed && !isascii (ch))
780 return re_string_peek_byte (pstr, idx);
787 internal_function __attribute ((pure))
788 re_string_fetch_byte_case (re_string_t *pstr)
790 if (BE (!pstr->mbs_allocated, 1))
791 return re_string_fetch_byte (pstr);
793 #ifdef RE_ENABLE_I18N
794 if (pstr->offsets_needed)
799 /* For tr_TR.UTF-8 [[:islower:]] there is
800 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
801 in that case the whole multi-byte character and return
802 the original letter. On the other side, with
803 [[: DOTLESS SMALL LETTER I return [[:I, as doing
804 anything else would complicate things too much. */
806 if (!re_string_first_byte (pstr, pstr->cur_idx))
807 return re_string_fetch_byte (pstr);
809 off = pstr->offsets[pstr->cur_idx];
810 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
813 return re_string_fetch_byte (pstr);
815 re_string_skip_bytes (pstr,
816 re_string_char_size_at (pstr, pstr->cur_idx));
821 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
826 re_string_destruct (re_string_t *pstr)
828 #ifdef RE_ENABLE_I18N
830 re_free (pstr->offsets);
831 #endif /* RE_ENABLE_I18N */
832 if (pstr->mbs_allocated)
836 /* Return the context at IDX in INPUT. */
840 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
843 if (BE (! REG_VALID_INDEX (idx), 0))
844 /* In this case, we use the value stored in input->tip_context,
845 since we can't know the character in input->mbs[-1] here. */
846 return input->tip_context;
847 if (BE (idx == input->len, 0))
848 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
849 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
850 #ifdef RE_ENABLE_I18N
851 if (input->mb_cur_max > 1)
855 while(input->wcs[wc_idx] == WEOF)
858 /* It must not happen. */
859 assert (REG_VALID_INDEX (wc_idx));
862 if (! REG_VALID_INDEX (wc_idx))
863 return input->tip_context;
865 wc = input->wcs[wc_idx];
866 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
868 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
869 ? CONTEXT_NEWLINE : 0);
874 c = re_string_byte_at (input, idx);
875 if (bitset_contain (input->word_char, c))
877 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
881 /* Functions for set operation. */
885 re_node_set_alloc (re_node_set *set, Idx size)
889 set->elems = re_xmalloc (Idx, size);
890 if (BE (set->elems == NULL, 0))
897 re_node_set_init_1 (re_node_set *set, Idx elem)
901 set->elems = re_malloc (Idx, 1);
902 if (BE (set->elems == NULL, 0))
904 set->alloc = set->nelem = 0;
907 set->elems[0] = elem;
913 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
916 set->elems = re_malloc (Idx, 2);
917 if (BE (set->elems == NULL, 0))
922 set->elems[0] = elem1;
929 set->elems[0] = elem1;
930 set->elems[1] = elem2;
934 set->elems[0] = elem2;
935 set->elems[1] = elem1;
943 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
945 dest->nelem = src->nelem;
948 dest->alloc = dest->nelem;
949 dest->elems = re_malloc (Idx, dest->alloc);
950 if (BE (dest->elems == NULL, 0))
952 dest->alloc = dest->nelem = 0;
955 memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
958 re_node_set_init_empty (dest);
962 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
963 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
964 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
968 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
969 const re_node_set *src2)
971 Idx i1, i2, is, id, delta, sbase;
972 if (src1->nelem == 0 || src2->nelem == 0)
975 /* We need dest->nelem + 2 * elems_in_intersection; this is a
976 conservative estimate. */
977 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
979 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
982 && (new_alloc < dest->alloc
983 || ((Idx) (src1->nelem + src2->nelem) < src1->nelem)))
985 new_elems = re_xrealloc (dest->elems, Idx, new_alloc);
986 if (BE (new_elems == NULL, 0))
988 dest->elems = new_elems;
989 dest->alloc = new_alloc;
992 /* Find the items in the intersection of SRC1 and SRC2, and copy
993 into the top of DEST those that are not already in DEST itself. */
994 sbase = dest->nelem + src1->nelem + src2->nelem;
995 i1 = src1->nelem - 1;
996 i2 = src2->nelem - 1;
997 id = dest->nelem - 1;
1000 if (src1->elems[i1] == src2->elems[i2])
1002 /* Try to find the item in DEST. Maybe we could binary search? */
1003 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1006 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1007 dest->elems[--sbase] = src1->elems[i1];
1009 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1013 /* Lower the highest of the two items. */
1014 else if (src1->elems[i1] < src2->elems[i2])
1016 if (! REG_VALID_INDEX (--i2))
1021 if (! REG_VALID_INDEX (--i1))
1026 id = dest->nelem - 1;
1027 is = dest->nelem + src1->nelem + src2->nelem - 1;
1028 delta = is - sbase + 1;
1030 /* Now copy. When DELTA becomes zero, the remaining
1031 DEST elements are already in place; this is more or
1032 less the same loop that is in re_node_set_merge. */
1033 dest->nelem += delta;
1034 if (delta > 0 && REG_VALID_INDEX (id))
1037 if (dest->elems[is] > dest->elems[id])
1039 /* Copy from the top. */
1040 dest->elems[id + delta--] = dest->elems[is--];
1046 /* Slide from the bottom. */
1047 dest->elems[id + delta] = dest->elems[id];
1048 if (! REG_VALID_INDEX (--id))
1053 /* Copy remaining SRC elements. */
1054 memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]);
1059 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1060 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1062 static reg_errcode_t
1064 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1065 const re_node_set *src2)
1068 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1070 dest->alloc = src1->nelem + src2->nelem;
1071 if (sizeof (Idx) < 2 && dest->alloc < src1->nelem)
1073 dest->elems = re_xmalloc (Idx, dest->alloc);
1074 if (BE (dest->elems == NULL, 0))
1079 if (src1 != NULL && src1->nelem > 0)
1080 return re_node_set_init_copy (dest, src1);
1081 else if (src2 != NULL && src2->nelem > 0)
1082 return re_node_set_init_copy (dest, src2);
1084 re_node_set_init_empty (dest);
1087 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1089 if (src1->elems[i1] > src2->elems[i2])
1091 dest->elems[id++] = src2->elems[i2++];
1094 if (src1->elems[i1] == src2->elems[i2])
1096 dest->elems[id++] = src1->elems[i1++];
1098 if (i1 < src1->nelem)
1100 memcpy (dest->elems + id, src1->elems + i1,
1101 (src1->nelem - i1) * sizeof dest->elems[0]);
1102 id += src1->nelem - i1;
1104 else if (i2 < src2->nelem)
1106 memcpy (dest->elems + id, src2->elems + i2,
1107 (src2->nelem - i2) * sizeof dest->elems[0]);
1108 id += src2->nelem - i2;
1114 /* Calculate the union set of the sets DEST and SRC. And store it to
1115 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1117 static reg_errcode_t
1119 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1121 Idx is, id, sbase, delta;
1122 if (src == NULL || src->nelem == 0)
1124 if (sizeof (Idx) < 3
1125 && ((Idx) (2 * src->nelem) < src->nelem
1126 || (Idx) (2 * src->nelem + dest->nelem) < dest->nelem))
1128 if (dest->alloc < 2 * src->nelem + dest->nelem)
1130 Idx new_alloc = src->nelem + dest->alloc;
1132 if (sizeof (Idx) < 4 && new_alloc < dest->alloc)
1134 new_buffer = re_x2realloc (dest->elems, Idx, &new_alloc);
1135 if (BE (new_buffer == NULL, 0))
1137 dest->elems = new_buffer;
1138 dest->alloc = new_alloc;
1141 if (BE (dest->nelem == 0, 0))
1143 dest->nelem = src->nelem;
1144 memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
1148 /* Copy into the top of DEST the items of SRC that are not
1149 found in DEST. Maybe we could binary search in DEST? */
1150 for (sbase = dest->nelem + 2 * src->nelem,
1151 is = src->nelem - 1, id = dest->nelem - 1;
1152 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1154 if (dest->elems[id] == src->elems[is])
1156 else if (dest->elems[id] < src->elems[is])
1157 dest->elems[--sbase] = src->elems[is--];
1158 else /* if (dest->elems[id] > src->elems[is]) */
1162 if (REG_VALID_INDEX (is))
1164 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1166 memcpy (dest->elems + sbase, src->elems,
1167 (is + 1) * sizeof dest->elems[0]);
1170 id = dest->nelem - 1;
1171 is = dest->nelem + 2 * src->nelem - 1;
1172 delta = is - sbase + 1;
1176 /* Now copy. When DELTA becomes zero, the remaining
1177 DEST elements are already in place. */
1178 dest->nelem += delta;
1181 if (dest->elems[is] > dest->elems[id])
1183 /* Copy from the top. */
1184 dest->elems[id + delta--] = dest->elems[is--];
1190 /* Slide from the bottom. */
1191 dest->elems[id + delta] = dest->elems[id];
1192 if (! REG_VALID_INDEX (--id))
1194 /* Copy remaining SRC elements. */
1195 memcpy (dest->elems, dest->elems + sbase,
1196 delta * sizeof dest->elems[0]);
1205 /* Insert the new element ELEM to the re_node_set* SET.
1206 SET should not already have ELEM.
1207 Return true if successful. */
1211 re_node_set_insert (re_node_set *set, Idx elem)
1214 /* In case the set is empty. */
1215 if (set->alloc == 0)
1216 return re_node_set_init_1 (set, elem) == REG_NOERROR;
1218 if (BE (set->nelem, 0) == 0)
1220 /* We already guaranteed above that set->alloc != 0. */
1221 set->elems[0] = elem;
1226 /* Realloc if we need. */
1227 if (set->alloc == set->nelem)
1229 Idx *new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1230 if (BE (new_elems == NULL, 0))
1232 set->elems = new_elems;
1235 /* Move the elements which follows the new element. Test the
1236 first element separately to skip a check in the inner loop. */
1237 if (elem < set->elems[0])
1240 for (idx = set->nelem; idx > 0; idx--)
1241 set->elems[idx] = set->elems[idx - 1];
1245 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1246 set->elems[idx] = set->elems[idx - 1];
1249 /* Insert the new element. */
1250 set->elems[idx] = elem;
1255 /* Insert the new element ELEM to the re_node_set* SET.
1256 SET should not already have any element greater than or equal to ELEM.
1257 Return true if successful. */
1261 re_node_set_insert_last (re_node_set *set, Idx elem)
1263 /* Realloc if we need. */
1264 if (set->alloc == set->nelem)
1267 new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1268 if (BE (new_elems == NULL, 0))
1270 set->elems = new_elems;
1273 /* Insert the new element. */
1274 set->elems[set->nelem++] = elem;
1278 /* Compare two node sets SET1 and SET2.
1279 Return true if SET1 and SET2 are equivalent. */
1282 internal_function __attribute ((pure))
1283 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1286 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1288 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1289 if (set1->elems[i] != set2->elems[i])
1294 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1297 internal_function __attribute ((pure))
1298 re_node_set_contains (const re_node_set *set, Idx elem)
1300 __re_size_t idx, right, mid;
1301 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1304 /* Binary search the element. */
1306 right = set->nelem - 1;
1309 mid = (idx + right) / 2;
1310 if (set->elems[mid] < elem)
1315 return set->elems[idx] == elem ? idx + 1 : 0;
1320 re_node_set_remove_at (re_node_set *set, Idx idx)
1322 if (idx < 0 || idx >= set->nelem)
1325 for (; idx < set->nelem; idx++)
1326 set->elems[idx] = set->elems[idx + 1];
1330 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1331 Or return REG_MISSING if an error occurred. */
1335 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1337 int type = token.type;
1338 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1340 Idx new_nodes_alloc = dfa->nodes_alloc;
1341 Idx *new_nexts, *new_indices;
1342 re_node_set *new_edests, *new_eclosures;
1344 re_token_t *new_nodes = re_x2realloc (dfa->nodes, re_token_t,
1346 if (BE (new_nodes == NULL, 0))
1348 dfa->nodes = new_nodes;
1349 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1350 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1351 new_edests = re_xrealloc (dfa->edests, re_node_set, new_nodes_alloc);
1352 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1353 if (BE (new_nexts == NULL || new_indices == NULL
1354 || new_edests == NULL || new_eclosures == NULL, 0))
1356 dfa->nexts = new_nexts;
1357 dfa->org_indices = new_indices;
1358 dfa->edests = new_edests;
1359 dfa->eclosures = new_eclosures;
1360 dfa->nodes_alloc = new_nodes_alloc;
1362 dfa->nodes[dfa->nodes_len] = token;
1363 dfa->nodes[dfa->nodes_len].constraint = 0;
1364 #ifdef RE_ENABLE_I18N
1365 dfa->nodes[dfa->nodes_len].accept_mb =
1366 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1368 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1369 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1370 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1371 return dfa->nodes_len++;
1374 static inline re_hashval_t
1376 calc_state_hash (const re_node_set *nodes, unsigned int context)
1378 re_hashval_t hash = nodes->nelem + context;
1380 for (i = 0 ; i < nodes->nelem ; i++)
1381 hash += nodes->elems[i];
1385 /* Search for the state whose node_set is equivalent to NODES.
1386 Return the pointer to the state, if we found it in the DFA.
1387 Otherwise create the new one and return it. In case of an error
1388 return NULL and set the error code in ERR.
1389 Note: - We assume NULL as the invalid state, then it is possible that
1390 return value is NULL and ERR is REG_NOERROR.
1391 - We never return non-NULL value in case of any errors, it is for
1394 static re_dfastate_t*
1396 re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes)
1399 re_dfastate_t *new_state;
1400 struct re_state_table_entry *spot;
1403 /* Suppress bogus uninitialized-variable warnings. */
1406 if (BE (nodes->nelem == 0, 0))
1411 hash = calc_state_hash (nodes, 0);
1412 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1414 for (i = 0 ; i < spot->num ; i++)
1416 re_dfastate_t *state = spot->array[i];
1417 if (hash != state->hash)
1419 if (re_node_set_compare (&state->nodes, nodes))
1423 /* There are no appropriate state in the dfa, create the new one. */
1424 new_state = create_ci_newstate (dfa, nodes, hash);
1425 if (BE (new_state != NULL, 1))
1434 /* Search for the state whose node_set is equivalent to NODES and
1435 whose context is equivalent to CONTEXT.
1436 Return the pointer to the state, if we found it in the DFA.
1437 Otherwise create the new one and return it. In case of an error
1438 return NULL and set the error code in ERR.
1439 Note: - We assume NULL as the invalid state, then it is possible that
1440 return value is NULL and ERR is REG_NOERROR.
1441 - We never return non-NULL value in case of any errors, it is for
1444 static re_dfastate_t*
1446 re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa,
1447 const re_node_set *nodes, unsigned int context)
1450 re_dfastate_t *new_state;
1451 struct re_state_table_entry *spot;
1454 /* Suppress bogus uninitialized-variable warnings. */
1457 if (nodes->nelem == 0)
1462 hash = calc_state_hash (nodes, context);
1463 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1465 for (i = 0 ; i < spot->num ; i++)
1467 re_dfastate_t *state = spot->array[i];
1468 if (state->hash == hash
1469 && state->context == context
1470 && re_node_set_compare (state->entrance_nodes, nodes))
1473 /* There are no appropriate state in `dfa', create the new one. */
1474 new_state = create_cd_newstate (dfa, nodes, context, hash);
1475 if (BE (new_state != NULL, 1))
1484 /* Finish initialization of the new state NEWSTATE, and using its hash value
1485 HASH put in the appropriate bucket of DFA's state table. Return value
1486 indicates the error code if failed. */
1488 static reg_errcode_t
1490 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash)
1492 struct re_state_table_entry *spot;
1496 newstate->hash = hash;
1497 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1498 if (BE (err != REG_NOERROR, 0))
1500 for (i = 0; i < newstate->nodes.nelem; i++)
1502 Idx elem = newstate->nodes.elems[i];
1503 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1505 bool ok = re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1511 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1512 if (BE (spot->alloc <= spot->num, 0))
1514 Idx new_alloc = spot->num;
1515 re_dfastate_t **new_array = re_x2realloc (spot->array, re_dfastate_t *,
1517 if (BE (new_array == NULL, 0))
1519 spot->array = new_array;
1520 spot->alloc = new_alloc;
1522 spot->array[spot->num++] = newstate;
1526 /* Create the new state which is independ of contexts.
1527 Return the new state if succeeded, otherwise return NULL. */
1529 static re_dfastate_t *
1531 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1536 re_dfastate_t *newstate;
1538 newstate = re_calloc (re_dfastate_t, 1);
1539 if (BE (newstate == NULL, 0))
1541 err = re_node_set_init_copy (&newstate->nodes, nodes);
1542 if (BE (err != REG_NOERROR, 0))
1548 newstate->entrance_nodes = &newstate->nodes;
1549 for (i = 0 ; i < nodes->nelem ; i++)
1551 re_token_t *node = dfa->nodes + nodes->elems[i];
1552 re_token_type_t type = node->type;
1553 if (type == CHARACTER && !node->constraint)
1555 #ifdef RE_ENABLE_I18N
1556 newstate->accept_mb |= node->accept_mb;
1557 #endif /* RE_ENABLE_I18N */
1559 /* If the state has the halt node, the state is a halt state. */
1560 if (type == END_OF_RE)
1562 else if (type == OP_BACK_REF)
1563 newstate->has_backref = 1;
1564 else if (type == ANCHOR || node->constraint)
1565 newstate->has_constraint = 1;
1567 err = register_state (dfa, newstate, hash);
1568 if (BE (err != REG_NOERROR, 0))
1570 free_state (newstate);
1576 /* Create the new state which is depend on the context CONTEXT.
1577 Return the new state if succeeded, otherwise return NULL. */
1579 static re_dfastate_t *
1581 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1582 unsigned int context, re_hashval_t hash)
1584 Idx i, nctx_nodes = 0;
1586 re_dfastate_t *newstate;
1588 newstate = re_calloc (re_dfastate_t, 1);
1589 if (BE (newstate == NULL, 0))
1591 err = re_node_set_init_copy (&newstate->nodes, nodes);
1592 if (BE (err != REG_NOERROR, 0))
1598 newstate->context = context;
1599 newstate->entrance_nodes = &newstate->nodes;
1601 for (i = 0 ; i < nodes->nelem ; i++)
1603 unsigned int constraint = 0;
1604 re_token_t *node = dfa->nodes + nodes->elems[i];
1605 re_token_type_t type = node->type;
1606 if (node->constraint)
1607 constraint = node->constraint;
1609 if (type == CHARACTER && !constraint)
1611 #ifdef RE_ENABLE_I18N
1612 newstate->accept_mb |= node->accept_mb;
1613 #endif /* RE_ENABLE_I18N */
1615 /* If the state has the halt node, the state is a halt state. */
1616 if (type == END_OF_RE)
1618 else if (type == OP_BACK_REF)
1619 newstate->has_backref = 1;
1620 else if (type == ANCHOR)
1621 constraint = node->opr.ctx_type;
1625 if (newstate->entrance_nodes == &newstate->nodes)
1627 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1628 if (BE (newstate->entrance_nodes == NULL, 0))
1630 free_state (newstate);
1633 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1635 newstate->has_constraint = 1;
1638 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1640 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1645 err = register_state (dfa, newstate, hash);
1646 if (BE (err != REG_NOERROR, 0))
1648 free_state (newstate);
1656 free_state (re_dfastate_t *state)
1658 re_node_set_free (&state->non_eps_nodes);
1659 re_node_set_free (&state->inveclosure);
1660 if (state->entrance_nodes != &state->nodes)
1662 re_node_set_free (state->entrance_nodes);
1663 re_free (state->entrance_nodes);
1665 re_node_set_free (&state->nodes);
1666 re_free (state->word_trtable);
1667 re_free (state->trtable);