1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static void re_string_construct_common (const char *str, Idx len,
22 RE_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
30 re_hashval_t hash) internal_function;
32 /* Functions for string operation. */
34 /* This function allocate the buffers. It is necessary to call
35 re_string_reconstruct before using the object. */
39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
40 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
45 /* Ensure at least one character fits into the buffers. */
46 if (init_len < dfa->mb_cur_max)
47 init_len = dfa->mb_cur_max;
48 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49 re_string_construct_common (str, len, pstr, trans, icase, dfa);
51 ret = re_string_realloc_buffers (pstr, init_buf_len);
52 if (BE (ret != REG_NOERROR, 0))
55 pstr->word_char = dfa->word_char;
56 pstr->word_ops_used = dfa->word_ops_used;
57 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59 pstr->valid_raw_len = pstr->valid_len;
63 /* This function allocate the buffers, and initialize them. */
67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
68 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
71 memset (pstr, '\0', sizeof (re_string_t));
72 re_string_construct_common (str, len, pstr, trans, icase, dfa);
76 ret = re_string_realloc_buffers (pstr, len + 1);
77 if (BE (ret != REG_NOERROR, 0))
80 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
85 if (dfa->mb_cur_max > 1)
89 ret = build_wcs_upper_buffer (pstr);
90 if (BE (ret != REG_NOERROR, 0))
92 if (pstr->valid_raw_len >= len)
94 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 if (BE (ret != REG_NOERROR, 0))
102 #endif /* RE_ENABLE_I18N */
103 build_upper_buffer (pstr);
107 #ifdef RE_ENABLE_I18N
108 if (dfa->mb_cur_max > 1)
109 build_wcs_buffer (pstr);
111 #endif /* RE_ENABLE_I18N */
114 re_string_translate_buffer (pstr);
117 pstr->valid_len = pstr->bufs_len;
118 pstr->valid_raw_len = pstr->bufs_len;
126 /* Helper functions for re_string_allocate, and re_string_construct. */
130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
132 #ifdef RE_ENABLE_I18N
133 if (pstr->mb_cur_max > 1)
137 /* Avoid overflow. */
138 size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
139 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
142 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
143 if (BE (new_wcs == NULL, 0))
146 if (pstr->offsets != NULL)
148 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
149 if (BE (new_offsets == NULL, 0))
151 pstr->offsets = new_offsets;
154 #endif /* RE_ENABLE_I18N */
155 if (pstr->mbs_allocated)
157 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
159 if (BE (new_mbs == NULL, 0))
163 pstr->bufs_len = new_buf_len;
170 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
171 RE_TRANSLATE_TYPE trans, bool icase,
174 pstr->raw_mbs = (const unsigned char *) str;
179 pstr->mbs_allocated = (trans != NULL || icase);
180 pstr->mb_cur_max = dfa->mb_cur_max;
181 pstr->is_utf8 = dfa->is_utf8;
182 pstr->map_notascii = dfa->map_notascii;
183 pstr->stop = pstr->len;
184 pstr->raw_stop = pstr->stop;
187 #ifdef RE_ENABLE_I18N
189 /* Build wide character buffer PSTR->WCS.
190 If the byte sequence of the string are:
191 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
192 Then wide character buffer will be:
193 <wc1> , WEOF , <wc2> , WEOF , <wc3>
194 We use WEOF for padding, they indicate that the position isn't
195 a first byte of a multibyte character.
197 Note that this function assumes PSTR->VALID_LEN elements are already
198 built and starts from PSTR->VALID_LEN. */
202 build_wcs_buffer (re_string_t *pstr)
205 unsigned char buf[MB_LEN_MAX];
206 assert (MB_LEN_MAX >= pstr->mb_cur_max);
208 unsigned char buf[64];
211 Idx byte_idx, end_idx, remain_len;
214 /* Build the buffers from pstr->valid_len to either pstr->len or
216 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
217 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
222 remain_len = end_idx - byte_idx;
223 prev_st = pstr->cur_state;
224 /* Apply the translation if we need. */
225 if (BE (pstr->trans != NULL, 0))
229 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
231 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
232 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
234 p = (const char *) buf;
237 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
238 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
239 if (BE (mbclen == (size_t) -2, 0))
241 /* The buffer doesn't have enough space, finish to build. */
242 pstr->cur_state = prev_st;
245 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
247 /* We treat these cases as a singlebyte character. */
249 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
250 if (BE (pstr->trans != NULL, 0))
251 wc = pstr->trans[wc];
252 pstr->cur_state = prev_st;
255 /* Write wide character and padding. */
256 pstr->wcs[byte_idx++] = wc;
257 /* Write paddings. */
258 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
259 pstr->wcs[byte_idx++] = WEOF;
261 pstr->valid_len = byte_idx;
262 pstr->valid_raw_len = byte_idx;
265 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
266 but for REG_ICASE. */
270 build_wcs_upper_buffer (re_string_t *pstr)
273 Idx src_idx, byte_idx, end_idx, remain_len;
276 char buf[MB_LEN_MAX];
277 assert (MB_LEN_MAX >= pstr->mb_cur_max);
282 byte_idx = pstr->valid_len;
283 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
285 /* The following optimization assumes that ASCII characters can be
286 mapped to wide characters with a simple cast. */
287 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
289 while (byte_idx < end_idx)
293 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
294 && mbsinit (&pstr->cur_state))
296 /* In case of a singlebyte character. */
298 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
299 /* The next step uses the assumption that wchar_t is encoded
300 ASCII-safe: all ASCII values can be converted like this. */
301 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
306 remain_len = end_idx - byte_idx;
307 prev_st = pstr->cur_state;
308 mbclen = mbrtowc (&wc,
309 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
310 + byte_idx), remain_len, &pstr->cur_state);
311 if (BE (mbclen < (size_t) -2, 1))
319 mbcdlen = wcrtomb (buf, wcu, &prev_st);
320 if (BE (mbclen == mbcdlen, 1))
321 memcpy (pstr->mbs + byte_idx, buf, mbclen);
329 memcpy (pstr->mbs + byte_idx,
330 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
331 pstr->wcs[byte_idx++] = wcu;
332 /* Write paddings. */
333 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
334 pstr->wcs[byte_idx++] = WEOF;
336 else if (mbclen == (size_t) -1 || mbclen == 0)
338 /* It is an invalid character or '\0'. Just use the byte. */
339 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
340 pstr->mbs[byte_idx] = ch;
341 /* And also cast it to wide char. */
342 pstr->wcs[byte_idx++] = (wchar_t) ch;
343 if (BE (mbclen == (size_t) -1, 0))
344 pstr->cur_state = prev_st;
348 /* The buffer doesn't have enough space, finish to build. */
349 pstr->cur_state = prev_st;
353 pstr->valid_len = byte_idx;
354 pstr->valid_raw_len = byte_idx;
358 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
363 remain_len = end_idx - byte_idx;
364 prev_st = pstr->cur_state;
365 if (BE (pstr->trans != NULL, 0))
369 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
371 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
372 buf[i] = pstr->trans[ch];
374 p = (const char *) buf;
377 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
378 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
379 if (BE (mbclen < (size_t) -2, 1))
387 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
388 if (BE (mbclen == mbcdlen, 1))
389 memcpy (pstr->mbs + byte_idx, buf, mbclen);
390 else if (mbcdlen != (size_t) -1)
394 if (byte_idx + mbcdlen > pstr->bufs_len)
396 pstr->cur_state = prev_st;
400 if (pstr->offsets == NULL)
402 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
404 if (pstr->offsets == NULL)
407 if (!pstr->offsets_needed)
409 for (i = 0; i < (size_t) byte_idx; ++i)
410 pstr->offsets[i] = i;
411 pstr->offsets_needed = 1;
414 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
415 pstr->wcs[byte_idx] = wcu;
416 pstr->offsets[byte_idx] = src_idx;
417 for (i = 1; i < mbcdlen; ++i)
419 pstr->offsets[byte_idx + i]
420 = src_idx + (i < mbclen ? i : mbclen - 1);
421 pstr->wcs[byte_idx + i] = WEOF;
423 pstr->len += mbcdlen - mbclen;
424 if (pstr->raw_stop > src_idx)
425 pstr->stop += mbcdlen - mbclen;
426 end_idx = (pstr->bufs_len > pstr->len)
427 ? pstr->len : pstr->bufs_len;
433 memcpy (pstr->mbs + byte_idx, p, mbclen);
436 memcpy (pstr->mbs + byte_idx, p, mbclen);
438 if (BE (pstr->offsets_needed != 0, 0))
441 for (i = 0; i < mbclen; ++i)
442 pstr->offsets[byte_idx + i] = src_idx + i;
446 pstr->wcs[byte_idx++] = wcu;
447 /* Write paddings. */
448 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
449 pstr->wcs[byte_idx++] = WEOF;
451 else if (mbclen == (size_t) -1 || mbclen == 0)
453 /* It is an invalid character or '\0'. Just use the byte. */
454 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
456 if (BE (pstr->trans != NULL, 0))
457 ch = pstr->trans [ch];
458 pstr->mbs[byte_idx] = ch;
460 if (BE (pstr->offsets_needed != 0, 0))
461 pstr->offsets[byte_idx] = src_idx;
464 /* And also cast it to wide char. */
465 pstr->wcs[byte_idx++] = (wchar_t) ch;
466 if (BE (mbclen == (size_t) -1, 0))
467 pstr->cur_state = prev_st;
471 /* The buffer doesn't have enough space, finish to build. */
472 pstr->cur_state = prev_st;
476 pstr->valid_len = byte_idx;
477 pstr->valid_raw_len = src_idx;
481 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
486 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
493 /* Skip the characters which are not necessary to check. */
494 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
495 rawbuf_idx < new_raw_idx;)
499 remain_len = pstr->len - rawbuf_idx;
500 prev_st = pstr->cur_state;
501 mbclen = mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
502 remain_len, &pstr->cur_state);
503 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
505 /* We treat these cases as a single byte character. */
506 if (mbclen == 0 || remain_len == 0)
509 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
511 pstr->cur_state = prev_st;
515 /* Then proceed the next character. */
516 rawbuf_idx += mbclen;
521 #endif /* RE_ENABLE_I18N */
523 /* Build the buffer PSTR->MBS, and apply the translation if we need.
524 This function is used in case of REG_ICASE. */
528 build_upper_buffer (re_string_t *pstr)
530 Idx char_idx, end_idx;
531 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
533 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
535 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
536 if (BE (pstr->trans != NULL, 0))
537 ch = pstr->trans[ch];
539 pstr->mbs[char_idx] = toupper (ch);
541 pstr->mbs[char_idx] = ch;
543 pstr->valid_len = char_idx;
544 pstr->valid_raw_len = char_idx;
547 /* Apply TRANS to the buffer in PSTR. */
551 re_string_translate_buffer (re_string_t *pstr)
553 Idx buf_idx, end_idx;
554 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
556 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
558 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
559 pstr->mbs[buf_idx] = pstr->trans[ch];
562 pstr->valid_len = buf_idx;
563 pstr->valid_raw_len = buf_idx;
566 /* This function re-construct the buffers.
567 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
568 convert to upper case in case of REG_ICASE, apply translation. */
572 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
576 if (BE (pstr->raw_mbs_idx <= idx, 0))
577 offset = idx - pstr->raw_mbs_idx;
581 #ifdef RE_ENABLE_I18N
582 if (pstr->mb_cur_max > 1)
583 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
584 #endif /* RE_ENABLE_I18N */
585 pstr->len = pstr->raw_len;
586 pstr->stop = pstr->raw_stop;
588 pstr->raw_mbs_idx = 0;
589 pstr->valid_raw_len = 0;
590 pstr->offsets_needed = 0;
591 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
592 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
593 if (!pstr->mbs_allocated)
594 pstr->mbs = (unsigned char *) pstr->raw_mbs;
598 if (BE (offset != 0, 1))
600 /* Are the characters which are already checked remain? */
601 if (BE (offset < pstr->valid_raw_len, 1)
602 #ifdef RE_ENABLE_I18N
603 /* Handling this would enlarge the code too much.
604 Accept a slowdown in that case. */
605 && pstr->offsets_needed == 0
609 /* Yes, move them to the front of the buffer. */
610 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
611 #ifdef RE_ENABLE_I18N
612 if (pstr->mb_cur_max > 1)
613 memmove (pstr->wcs, pstr->wcs + offset,
614 (pstr->valid_len - offset) * sizeof (wint_t));
615 #endif /* RE_ENABLE_I18N */
616 if (BE (pstr->mbs_allocated, 0))
617 memmove (pstr->mbs, pstr->mbs + offset,
618 pstr->valid_len - offset);
619 pstr->valid_len -= offset;
620 pstr->valid_raw_len -= offset;
622 assert (pstr->valid_len > 0);
627 /* No, skip all characters until IDX. */
628 #ifdef RE_ENABLE_I18N
629 if (BE (pstr->offsets_needed, 0))
631 pstr->len = pstr->raw_len - idx + offset;
632 pstr->stop = pstr->raw_stop - idx + offset;
633 pstr->offsets_needed = 0;
637 #ifdef RE_ENABLE_I18N
638 if (pstr->mb_cur_max > 1)
645 const unsigned char *raw, *p, *q, *end;
647 /* Special case UTF-8. Multi-byte chars start with any
648 byte other than 0x80 - 0xbf. */
649 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
650 end = raw + (offset - pstr->mb_cur_max);
651 p = raw + offset - 1;
653 /* We know the wchar_t encoding is UCS4, so for the simple
654 case, ASCII characters, skip the conversion step. */
655 if (isascii (*p) && BE (pstr->trans == NULL, 1))
657 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
663 for (; p >= end; --p)
664 if ((*p & 0xc0) != 0x80)
668 Idx mlen = raw + pstr->len - p;
669 unsigned char buf[6];
673 if (BE (pstr->trans != NULL, 0))
675 int i = mlen < 6 ? mlen : 6;
677 buf[i] = pstr->trans[p[i]];
680 /* XXX Don't use mbrtowc, we know which conversion
681 to use (UTF-8 -> UCS4). */
682 memset (&cur_state, 0, sizeof (cur_state));
683 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
685 if (raw + offset - p <= mbclen
686 && mbclen < (size_t) -2)
688 memset (&pstr->cur_state, '\0',
690 pstr->valid_len = mbclen - (raw + offset - p);
698 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
701 = re_string_context_at (pstr, pstr->valid_raw_len - 1, eflags);
703 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
704 && IS_WIDE_WORD_CHAR (wc))
706 : ((IS_WIDE_NEWLINE (wc)
707 && pstr->newline_anchor)
708 ? CONTEXT_NEWLINE : 0));
709 if (BE (pstr->valid_len, 0))
711 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
712 pstr->wcs[wcs_idx] = WEOF;
713 if (pstr->mbs_allocated)
714 memset (pstr->mbs, -1, pstr->valid_len);
716 pstr->valid_raw_len = pstr->valid_len;
719 #endif /* RE_ENABLE_I18N */
721 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
722 pstr->valid_raw_len = 0;
725 pstr->tip_context = (bitset_contain (pstr->word_char, c)
727 : ((IS_NEWLINE (c) && pstr->newline_anchor)
728 ? CONTEXT_NEWLINE : 0));
731 if (!BE (pstr->mbs_allocated, 0))
734 pstr->raw_mbs_idx = idx;
736 pstr->stop -= offset;
738 /* Then build the buffers. */
739 #ifdef RE_ENABLE_I18N
740 if (pstr->mb_cur_max > 1)
744 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
745 if (BE (ret != REG_NOERROR, 0))
749 build_wcs_buffer (pstr);
752 #endif /* RE_ENABLE_I18N */
753 if (BE (pstr->mbs_allocated, 0))
756 build_upper_buffer (pstr);
757 else if (pstr->trans != NULL)
758 re_string_translate_buffer (pstr);
761 pstr->valid_len = pstr->len;
768 internal_function __attribute ((pure))
769 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
774 /* Handle the common (easiest) cases first. */
775 if (BE (!pstr->mbs_allocated, 1))
776 return re_string_peek_byte (pstr, idx);
778 #ifdef RE_ENABLE_I18N
779 if (pstr->mb_cur_max > 1
780 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
781 return re_string_peek_byte (pstr, idx);
784 off = pstr->cur_idx + idx;
785 #ifdef RE_ENABLE_I18N
786 if (pstr->offsets_needed)
787 off = pstr->offsets[off];
790 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
792 #ifdef RE_ENABLE_I18N
793 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
794 this function returns CAPITAL LETTER I instead of first byte of
795 DOTLESS SMALL LETTER I. The latter would confuse the parser,
796 since peek_byte_case doesn't advance cur_idx in any way. */
797 if (pstr->offsets_needed && !isascii (ch))
798 return re_string_peek_byte (pstr, idx);
805 internal_function __attribute ((pure))
806 re_string_fetch_byte_case (re_string_t *pstr)
808 if (BE (!pstr->mbs_allocated, 1))
809 return re_string_fetch_byte (pstr);
811 #ifdef RE_ENABLE_I18N
812 if (pstr->offsets_needed)
817 /* For tr_TR.UTF-8 [[:islower:]] there is
818 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
819 in that case the whole multi-byte character and return
820 the original letter. On the other side, with
821 [[: DOTLESS SMALL LETTER I return [[:I, as doing
822 anything else would complicate things too much. */
824 if (!re_string_first_byte (pstr, pstr->cur_idx))
825 return re_string_fetch_byte (pstr);
827 off = pstr->offsets[pstr->cur_idx];
828 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
831 return re_string_fetch_byte (pstr);
833 re_string_skip_bytes (pstr,
834 re_string_char_size_at (pstr, pstr->cur_idx));
839 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
844 re_string_destruct (re_string_t *pstr)
846 #ifdef RE_ENABLE_I18N
848 re_free (pstr->offsets);
849 #endif /* RE_ENABLE_I18N */
850 if (pstr->mbs_allocated)
854 /* Return the context at IDX in INPUT. */
858 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
861 if (BE (! REG_VALID_INDEX (idx), 0))
862 /* In this case, we use the value stored in input->tip_context,
863 since we can't know the character in input->mbs[-1] here. */
864 return input->tip_context;
865 if (BE (idx == input->len, 0))
866 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
867 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
868 #ifdef RE_ENABLE_I18N
869 if (input->mb_cur_max > 1)
873 while(input->wcs[wc_idx] == WEOF)
876 /* It must not happen. */
877 assert (REG_VALID_INDEX (wc_idx));
880 if (! REG_VALID_INDEX (wc_idx))
881 return input->tip_context;
883 wc = input->wcs[wc_idx];
884 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
886 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
887 ? CONTEXT_NEWLINE : 0);
892 c = re_string_byte_at (input, idx);
893 if (bitset_contain (input->word_char, c))
895 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
899 /* Functions for set operation. */
903 re_node_set_alloc (re_node_set *set, Idx size)
907 set->elems = re_malloc (Idx, size);
908 if (BE (set->elems == NULL, 0))
915 re_node_set_init_1 (re_node_set *set, Idx elem)
919 set->elems = re_malloc (Idx, 1);
920 if (BE (set->elems == NULL, 0))
922 set->alloc = set->nelem = 0;
925 set->elems[0] = elem;
931 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
934 set->elems = re_malloc (Idx, 2);
935 if (BE (set->elems == NULL, 0))
940 set->elems[0] = elem1;
947 set->elems[0] = elem1;
948 set->elems[1] = elem2;
952 set->elems[0] = elem2;
953 set->elems[1] = elem1;
961 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
963 dest->nelem = src->nelem;
966 dest->alloc = dest->nelem;
967 dest->elems = re_malloc (Idx, dest->alloc);
968 if (BE (dest->elems == NULL, 0))
970 dest->alloc = dest->nelem = 0;
973 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
976 re_node_set_init_empty (dest);
980 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
981 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
982 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
986 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
987 const re_node_set *src2)
989 Idx i1, i2, is, id, delta, sbase;
990 if (src1->nelem == 0 || src2->nelem == 0)
993 /* We need dest->nelem + 2 * elems_in_intersection; this is a
994 conservative estimate. */
995 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
997 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
998 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
999 if (BE (new_elems == NULL, 0))
1001 dest->elems = new_elems;
1002 dest->alloc = new_alloc;
1005 /* Find the items in the intersection of SRC1 and SRC2, and copy
1006 into the top of DEST those that are not already in DEST itself. */
1007 sbase = dest->nelem + src1->nelem + src2->nelem;
1008 i1 = src1->nelem - 1;
1009 i2 = src2->nelem - 1;
1010 id = dest->nelem - 1;
1013 if (src1->elems[i1] == src2->elems[i2])
1015 /* Try to find the item in DEST. Maybe we could binary search? */
1016 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1019 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1020 dest->elems[--sbase] = src1->elems[i1];
1022 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1026 /* Lower the highest of the two items. */
1027 else if (src1->elems[i1] < src2->elems[i2])
1029 if (! REG_VALID_INDEX (--i2))
1034 if (! REG_VALID_INDEX (--i1))
1039 id = dest->nelem - 1;
1040 is = dest->nelem + src1->nelem + src2->nelem - 1;
1041 delta = is - sbase + 1;
1043 /* Now copy. When DELTA becomes zero, the remaining
1044 DEST elements are already in place; this is more or
1045 less the same loop that is in re_node_set_merge. */
1046 dest->nelem += delta;
1047 if (delta > 0 && REG_VALID_INDEX (id))
1050 if (dest->elems[is] > dest->elems[id])
1052 /* Copy from the top. */
1053 dest->elems[id + delta--] = dest->elems[is--];
1059 /* Slide from the bottom. */
1060 dest->elems[id + delta] = dest->elems[id];
1061 if (! REG_VALID_INDEX (--id))
1066 /* Copy remaining SRC elements. */
1067 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1072 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1073 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1075 static reg_errcode_t
1077 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1078 const re_node_set *src2)
1081 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1083 dest->alloc = src1->nelem + src2->nelem;
1084 dest->elems = re_malloc (Idx, dest->alloc);
1085 if (BE (dest->elems == NULL, 0))
1090 if (src1 != NULL && src1->nelem > 0)
1091 return re_node_set_init_copy (dest, src1);
1092 else if (src2 != NULL && src2->nelem > 0)
1093 return re_node_set_init_copy (dest, src2);
1095 re_node_set_init_empty (dest);
1098 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1100 if (src1->elems[i1] > src2->elems[i2])
1102 dest->elems[id++] = src2->elems[i2++];
1105 if (src1->elems[i1] == src2->elems[i2])
1107 dest->elems[id++] = src1->elems[i1++];
1109 if (i1 < src1->nelem)
1111 memcpy (dest->elems + id, src1->elems + i1,
1112 (src1->nelem - i1) * sizeof (Idx));
1113 id += src1->nelem - i1;
1115 else if (i2 < src2->nelem)
1117 memcpy (dest->elems + id, src2->elems + i2,
1118 (src2->nelem - i2) * sizeof (Idx));
1119 id += src2->nelem - i2;
1125 /* Calculate the union set of the sets DEST and SRC. And store it to
1126 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1128 static reg_errcode_t
1130 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1132 Idx is, id, sbase, delta;
1133 if (src == NULL || src->nelem == 0)
1135 if (dest->alloc < 2 * src->nelem + dest->nelem)
1137 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1138 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1139 if (BE (new_buffer == NULL, 0))
1141 dest->elems = new_buffer;
1142 dest->alloc = new_alloc;
1145 if (BE (dest->nelem == 0, 0))
1147 dest->nelem = src->nelem;
1148 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1152 /* Copy into the top of DEST the items of SRC that are not
1153 found in DEST. Maybe we could binary search in DEST? */
1154 for (sbase = dest->nelem + 2 * src->nelem,
1155 is = src->nelem - 1, id = dest->nelem - 1;
1156 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1158 if (dest->elems[id] == src->elems[is])
1160 else if (dest->elems[id] < src->elems[is])
1161 dest->elems[--sbase] = src->elems[is--];
1162 else /* if (dest->elems[id] > src->elems[is]) */
1166 if (REG_VALID_INDEX (is))
1168 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1170 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1173 id = dest->nelem - 1;
1174 is = dest->nelem + 2 * src->nelem - 1;
1175 delta = is - sbase + 1;
1179 /* Now copy. When DELTA becomes zero, the remaining
1180 DEST elements are already in place. */
1181 dest->nelem += delta;
1184 if (dest->elems[is] > dest->elems[id])
1186 /* Copy from the top. */
1187 dest->elems[id + delta--] = dest->elems[is--];
1193 /* Slide from the bottom. */
1194 dest->elems[id + delta] = dest->elems[id];
1195 if (! REG_VALID_INDEX (--id))
1197 /* Copy remaining SRC elements. */
1198 memcpy (dest->elems, dest->elems + sbase,
1199 delta * sizeof (Idx));
1208 /* Insert the new element ELEM to the re_node_set* SET.
1209 SET should not already have ELEM.
1210 Return true if successful. */
1214 re_node_set_insert (re_node_set *set, Idx elem)
1217 /* In case the set is empty. */
1218 if (set->alloc == 0)
1219 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1221 if (BE (set->nelem, 0) == 0)
1223 /* We already guaranteed above that set->alloc != 0. */
1224 set->elems[0] = elem;
1229 /* Realloc if we need. */
1230 if (set->alloc == set->nelem)
1233 set->alloc = set->alloc * 2;
1234 new_elems = re_realloc (set->elems, Idx, set->alloc);
1235 if (BE (new_elems == NULL, 0))
1237 set->elems = new_elems;
1240 /* Move the elements which follows the new element. Test the
1241 first element separately to skip a check in the inner loop. */
1242 if (elem < set->elems[0])
1245 for (idx = set->nelem; idx > 0; idx--)
1246 set->elems[idx] = set->elems[idx - 1];
1250 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1251 set->elems[idx] = set->elems[idx - 1];
1254 /* Insert the new element. */
1255 set->elems[idx] = elem;
1260 /* Insert the new element ELEM to the re_node_set* SET.
1261 SET should not already have any element greater than or equal to ELEM.
1262 Return true if successful. */
1266 re_node_set_insert_last (re_node_set *set, Idx elem)
1268 /* Realloc if we need. */
1269 if (set->alloc == set->nelem)
1272 set->alloc = (set->alloc + 1) * 2;
1273 new_elems = re_realloc (set->elems, Idx, set->alloc);
1274 if (BE (new_elems == NULL, 0))
1276 set->elems = new_elems;
1279 /* Insert the new element. */
1280 set->elems[set->nelem++] = elem;
1284 /* Compare two node sets SET1 and SET2.
1285 Return true if SET1 and SET2 are equivalent. */
1288 internal_function __attribute ((pure))
1289 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1292 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1294 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1295 if (set1->elems[i] != set2->elems[i])
1300 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1303 internal_function __attribute ((pure))
1304 re_node_set_contains (const re_node_set *set, Idx elem)
1306 __re_size_t idx, right, mid;
1307 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1310 /* Binary search the element. */
1312 right = set->nelem - 1;
1315 mid = (idx + right) / 2;
1316 if (set->elems[mid] < elem)
1321 return set->elems[idx] == elem ? idx + 1 : 0;
1326 re_node_set_remove_at (re_node_set *set, Idx idx)
1328 if (idx < 0 || idx >= set->nelem)
1331 for (; idx < set->nelem; idx++)
1332 set->elems[idx] = set->elems[idx + 1];
1336 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1337 Or return REG_MISSING if an error occurred. */
1341 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1343 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1345 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1346 Idx *new_nexts, *new_indices;
1347 re_node_set *new_edests, *new_eclosures;
1348 re_token_t *new_nodes;
1349 size_t max_object_size =
1350 MAX (sizeof (re_token_t),
1351 MAX (sizeof (re_node_set),
1354 /* Avoid overflows. */
1355 if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1358 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1359 if (BE (new_nodes == NULL, 0))
1361 dfa->nodes = new_nodes;
1362 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1363 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1364 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1365 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1366 if (BE (new_nexts == NULL || new_indices == NULL
1367 || new_edests == NULL || new_eclosures == NULL, 0))
1369 dfa->nexts = new_nexts;
1370 dfa->org_indices = new_indices;
1371 dfa->edests = new_edests;
1372 dfa->eclosures = new_eclosures;
1373 dfa->nodes_alloc = new_nodes_alloc;
1375 dfa->nodes[dfa->nodes_len] = token;
1376 dfa->nodes[dfa->nodes_len].constraint = 0;
1377 #ifdef RE_ENABLE_I18N
1379 int type = token.type;
1380 dfa->nodes[dfa->nodes_len].accept_mb =
1381 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1384 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1385 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1386 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1387 return dfa->nodes_len++;
1390 static inline re_hashval_t
1392 calc_state_hash (const re_node_set *nodes, unsigned int context)
1394 re_hashval_t hash = nodes->nelem + context;
1396 for (i = 0 ; i < nodes->nelem ; i++)
1397 hash += nodes->elems[i];
1401 /* Search for the state whose node_set is equivalent to NODES.
1402 Return the pointer to the state, if we found it in the DFA.
1403 Otherwise create the new one and return it. In case of an error
1404 return NULL and set the error code in ERR.
1405 Note: - We assume NULL as the invalid state, then it is possible that
1406 return value is NULL and ERR is REG_NOERROR.
1407 - We never return non-NULL value in case of any errors, it is for
1410 static re_dfastate_t *
1412 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1413 const re_node_set *nodes)
1416 re_dfastate_t *new_state;
1417 struct re_state_table_entry *spot;
1420 /* Suppress bogus uninitialized-variable warnings. */
1423 if (BE (nodes->nelem == 0, 0))
1428 hash = calc_state_hash (nodes, 0);
1429 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1431 for (i = 0 ; i < spot->num ; i++)
1433 re_dfastate_t *state = spot->array[i];
1434 if (hash != state->hash)
1436 if (re_node_set_compare (&state->nodes, nodes))
1440 /* There are no appropriate state in the dfa, create the new one. */
1441 new_state = create_ci_newstate (dfa, nodes, hash);
1442 if (BE (new_state == NULL, 0))
1448 /* Search for the state whose node_set is equivalent to NODES and
1449 whose context is equivalent to CONTEXT.
1450 Return the pointer to the state, if we found it in the DFA.
1451 Otherwise create the new one and return it. In case of an error
1452 return NULL and set the error code in ERR.
1453 Note: - We assume NULL as the invalid state, then it is possible that
1454 return value is NULL and ERR is REG_NOERROR.
1455 - We never return non-NULL value in case of any errors, it is for
1458 static re_dfastate_t *
1460 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1461 const re_node_set *nodes, unsigned int context)
1464 re_dfastate_t *new_state;
1465 struct re_state_table_entry *spot;
1468 /* Suppress bogus uninitialized-variable warnings. */
1471 if (nodes->nelem == 0)
1476 hash = calc_state_hash (nodes, context);
1477 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1479 for (i = 0 ; i < spot->num ; i++)
1481 re_dfastate_t *state = spot->array[i];
1482 if (state->hash == hash
1483 && state->context == context
1484 && re_node_set_compare (state->entrance_nodes, nodes))
1487 /* There are no appropriate state in `dfa', create the new one. */
1488 new_state = create_cd_newstate (dfa, nodes, context, hash);
1489 if (BE (new_state == NULL, 0))
1495 /* Finish initialization of the new state NEWSTATE, and using its hash value
1496 HASH put in the appropriate bucket of DFA's state table. Return value
1497 indicates the error code if failed. */
1499 static reg_errcode_t
1500 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1503 struct re_state_table_entry *spot;
1507 newstate->hash = hash;
1508 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1509 if (BE (err != REG_NOERROR, 0))
1511 for (i = 0; i < newstate->nodes.nelem; i++)
1513 Idx elem = newstate->nodes.elems[i];
1514 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1515 if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1519 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1520 if (BE (spot->alloc <= spot->num, 0))
1522 Idx new_alloc = 2 * spot->num + 2;
1523 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1525 if (BE (new_array == NULL, 0))
1527 spot->array = new_array;
1528 spot->alloc = new_alloc;
1530 spot->array[spot->num++] = newstate;
1535 free_state (re_dfastate_t *state)
1537 re_node_set_free (&state->non_eps_nodes);
1538 re_node_set_free (&state->inveclosure);
1539 if (state->entrance_nodes != &state->nodes)
1541 re_node_set_free (state->entrance_nodes);
1542 re_free (state->entrance_nodes);
1544 re_node_set_free (&state->nodes);
1545 re_free (state->word_trtable);
1546 re_free (state->trtable);
1550 /* Create the new state which is independ of contexts.
1551 Return the new state if succeeded, otherwise return NULL. */
1553 static re_dfastate_t *
1555 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1560 re_dfastate_t *newstate;
1562 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1563 if (BE (newstate == NULL, 0))
1565 err = re_node_set_init_copy (&newstate->nodes, nodes);
1566 if (BE (err != REG_NOERROR, 0))
1572 newstate->entrance_nodes = &newstate->nodes;
1573 for (i = 0 ; i < nodes->nelem ; i++)
1575 re_token_t *node = dfa->nodes + nodes->elems[i];
1576 re_token_type_t type = node->type;
1577 if (type == CHARACTER && !node->constraint)
1579 #ifdef RE_ENABLE_I18N
1580 newstate->accept_mb |= node->accept_mb;
1581 #endif /* RE_ENABLE_I18N */
1583 /* If the state has the halt node, the state is a halt state. */
1584 if (type == END_OF_RE)
1586 else if (type == OP_BACK_REF)
1587 newstate->has_backref = 1;
1588 else if (type == ANCHOR || node->constraint)
1589 newstate->has_constraint = 1;
1591 err = register_state (dfa, newstate, hash);
1592 if (BE (err != REG_NOERROR, 0))
1594 free_state (newstate);
1600 /* Create the new state which is depend on the context CONTEXT.
1601 Return the new state if succeeded, otherwise return NULL. */
1603 static re_dfastate_t *
1605 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1606 unsigned int context, re_hashval_t hash)
1608 Idx i, nctx_nodes = 0;
1610 re_dfastate_t *newstate;
1612 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1613 if (BE (newstate == NULL, 0))
1615 err = re_node_set_init_copy (&newstate->nodes, nodes);
1616 if (BE (err != REG_NOERROR, 0))
1622 newstate->context = context;
1623 newstate->entrance_nodes = &newstate->nodes;
1625 for (i = 0 ; i < nodes->nelem ; i++)
1627 unsigned int constraint = 0;
1628 re_token_t *node = dfa->nodes + nodes->elems[i];
1629 re_token_type_t type = node->type;
1630 if (node->constraint)
1631 constraint = node->constraint;
1633 if (type == CHARACTER && !constraint)
1635 #ifdef RE_ENABLE_I18N
1636 newstate->accept_mb |= node->accept_mb;
1637 #endif /* RE_ENABLE_I18N */
1639 /* If the state has the halt node, the state is a halt state. */
1640 if (type == END_OF_RE)
1642 else if (type == OP_BACK_REF)
1643 newstate->has_backref = 1;
1644 else if (type == ANCHOR)
1645 constraint = node->opr.ctx_type;
1649 if (newstate->entrance_nodes == &newstate->nodes)
1651 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1652 if (BE (newstate->entrance_nodes == NULL, 0))
1654 free_state (newstate);
1657 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1659 newstate->has_constraint = 1;
1662 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1664 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1669 err = register_state (dfa, newstate, hash);
1670 if (BE (err != REG_NOERROR, 0))
1672 free_state (newstate);