1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static void re_string_construct_common (const char *str, Idx len,
22 RE_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
30 re_hashval_t hash) internal_function;
32 /* Functions for string operation. */
34 /* This function allocate the buffers. It is necessary to call
35 re_string_reconstruct before using the object. */
39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
40 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
45 /* Ensure at least one character fits into the buffers. */
46 if (init_len < dfa->mb_cur_max)
47 init_len = dfa->mb_cur_max;
48 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49 re_string_construct_common (str, len, pstr, trans, icase, dfa);
51 ret = re_string_realloc_buffers (pstr, init_buf_len);
52 if (BE (ret != REG_NOERROR, 0))
55 pstr->word_char = dfa->word_char;
56 pstr->word_ops_used = dfa->word_ops_used;
57 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59 pstr->valid_raw_len = pstr->valid_len;
63 /* This function allocate the buffers, and initialize them. */
67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
68 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
71 memset (pstr, '\0', sizeof (re_string_t));
72 re_string_construct_common (str, len, pstr, trans, icase, dfa);
76 ret = re_string_realloc_buffers (pstr, len + 1);
77 if (BE (ret != REG_NOERROR, 0))
80 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
85 if (dfa->mb_cur_max > 1)
89 ret = build_wcs_upper_buffer (pstr);
90 if (BE (ret != REG_NOERROR, 0))
92 if (pstr->valid_raw_len >= len)
94 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 if (BE (ret != REG_NOERROR, 0))
102 #endif /* RE_ENABLE_I18N */
103 build_upper_buffer (pstr);
107 #ifdef RE_ENABLE_I18N
108 if (dfa->mb_cur_max > 1)
109 build_wcs_buffer (pstr);
111 #endif /* RE_ENABLE_I18N */
114 re_string_translate_buffer (pstr);
117 pstr->valid_len = pstr->bufs_len;
118 pstr->valid_raw_len = pstr->bufs_len;
126 /* Helper functions for re_string_allocate, and re_string_construct. */
130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
132 #ifdef RE_ENABLE_I18N
133 if (pstr->mb_cur_max > 1)
137 /* Avoid overflow. */
138 size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
139 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
142 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
143 if (BE (new_wcs == NULL, 0))
146 if (pstr->offsets != NULL)
148 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
149 if (BE (new_offsets == NULL, 0))
151 pstr->offsets = new_offsets;
154 #endif /* RE_ENABLE_I18N */
155 if (pstr->mbs_allocated)
157 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
159 if (BE (new_mbs == NULL, 0))
163 pstr->bufs_len = new_buf_len;
170 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
171 RE_TRANSLATE_TYPE trans, bool icase,
174 pstr->raw_mbs = (const unsigned char *) str;
179 pstr->mbs_allocated = (trans != NULL || icase);
180 pstr->mb_cur_max = dfa->mb_cur_max;
181 pstr->is_utf8 = dfa->is_utf8;
182 pstr->map_notascii = dfa->map_notascii;
183 pstr->stop = pstr->len;
184 pstr->raw_stop = pstr->stop;
187 #ifdef RE_ENABLE_I18N
189 /* Build wide character buffer PSTR->WCS.
190 If the byte sequence of the string are:
191 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
192 Then wide character buffer will be:
193 <wc1> , WEOF , <wc2> , WEOF , <wc3>
194 We use WEOF for padding, they indicate that the position isn't
195 a first byte of a multibyte character.
197 Note that this function assumes PSTR->VALID_LEN elements are already
198 built and starts from PSTR->VALID_LEN. */
202 build_wcs_buffer (re_string_t *pstr)
205 unsigned char buf[MB_LEN_MAX];
206 assert (MB_LEN_MAX >= pstr->mb_cur_max);
208 unsigned char buf[64];
211 Idx byte_idx, end_idx, remain_len;
214 /* Build the buffers from pstr->valid_len to either pstr->len or
216 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
217 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
222 remain_len = end_idx - byte_idx;
223 prev_st = pstr->cur_state;
224 /* Apply the translation if we need. */
225 if (BE (pstr->trans != NULL, 0))
229 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
231 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
232 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
234 p = (const char *) buf;
237 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
238 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
239 if (BE (mbclen == (size_t) -2, 0))
241 /* The buffer doesn't have enough space, finish to build. */
242 pstr->cur_state = prev_st;
245 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
247 /* We treat these cases as a singlebyte character. */
249 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
250 if (BE (pstr->trans != NULL, 0))
251 wc = pstr->trans[wc];
252 pstr->cur_state = prev_st;
255 /* Write wide character and padding. */
256 pstr->wcs[byte_idx++] = wc;
257 /* Write paddings. */
258 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
259 pstr->wcs[byte_idx++] = WEOF;
261 pstr->valid_len = byte_idx;
262 pstr->valid_raw_len = byte_idx;
265 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
266 but for REG_ICASE. */
270 build_wcs_upper_buffer (re_string_t *pstr)
273 Idx src_idx, byte_idx, end_idx, remain_len;
276 char buf[MB_LEN_MAX];
277 assert (MB_LEN_MAX >= pstr->mb_cur_max);
282 byte_idx = pstr->valid_len;
283 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
285 /* The following optimization assumes that ASCII characters can be
286 mapped to wide characters with a simple cast. */
287 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
289 while (byte_idx < end_idx)
293 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
294 && mbsinit (&pstr->cur_state))
296 /* In case of a singlebyte character. */
298 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
299 /* The next step uses the assumption that wchar_t is encoded
300 ASCII-safe: all ASCII values can be converted like this. */
301 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
306 remain_len = end_idx - byte_idx;
307 prev_st = pstr->cur_state;
308 mbclen = mbrtowc (&wc,
309 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
310 + byte_idx), remain_len, &pstr->cur_state);
311 if (BE (mbclen < (size_t) -2, 1))
319 mbcdlen = wcrtomb (buf, wcu, &prev_st);
320 if (BE (mbclen == mbcdlen, 1))
321 memcpy (pstr->mbs + byte_idx, buf, mbclen);
329 memcpy (pstr->mbs + byte_idx,
330 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
331 pstr->wcs[byte_idx++] = wcu;
332 /* Write paddings. */
333 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
334 pstr->wcs[byte_idx++] = WEOF;
336 else if (mbclen == (size_t) -1 || mbclen == 0)
338 /* It is an invalid character or '\0'. Just use the byte. */
339 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
340 pstr->mbs[byte_idx] = ch;
341 /* And also cast it to wide char. */
342 pstr->wcs[byte_idx++] = (wchar_t) ch;
343 if (BE (mbclen == (size_t) -1, 0))
344 pstr->cur_state = prev_st;
348 /* The buffer doesn't have enough space, finish to build. */
349 pstr->cur_state = prev_st;
353 pstr->valid_len = byte_idx;
354 pstr->valid_raw_len = byte_idx;
358 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
363 remain_len = end_idx - byte_idx;
364 prev_st = pstr->cur_state;
365 if (BE (pstr->trans != NULL, 0))
369 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
371 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
372 buf[i] = pstr->trans[ch];
374 p = (const char *) buf;
377 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
378 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
379 if (BE (mbclen < (size_t) -2, 1))
387 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
388 if (BE (mbclen == mbcdlen, 1))
389 memcpy (pstr->mbs + byte_idx, buf, mbclen);
390 else if (mbcdlen != (size_t) -1)
394 if (byte_idx + mbcdlen > pstr->bufs_len)
396 pstr->cur_state = prev_st;
400 if (pstr->offsets == NULL)
402 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
404 if (pstr->offsets == NULL)
407 if (!pstr->offsets_needed)
409 for (i = 0; i < (size_t) byte_idx; ++i)
410 pstr->offsets[i] = i;
411 pstr->offsets_needed = 1;
414 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
415 pstr->wcs[byte_idx] = wcu;
416 pstr->offsets[byte_idx] = src_idx;
417 for (i = 1; i < mbcdlen; ++i)
419 pstr->offsets[byte_idx + i]
420 = src_idx + (i < mbclen ? i : mbclen - 1);
421 pstr->wcs[byte_idx + i] = WEOF;
423 pstr->len += mbcdlen - mbclen;
424 if (pstr->raw_stop > src_idx)
425 pstr->stop += mbcdlen - mbclen;
426 end_idx = (pstr->bufs_len > pstr->len)
427 ? pstr->len : pstr->bufs_len;
433 memcpy (pstr->mbs + byte_idx, p, mbclen);
436 memcpy (pstr->mbs + byte_idx, p, mbclen);
438 if (BE (pstr->offsets_needed != 0, 0))
441 for (i = 0; i < mbclen; ++i)
442 pstr->offsets[byte_idx + i] = src_idx + i;
446 pstr->wcs[byte_idx++] = wcu;
447 /* Write paddings. */
448 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
449 pstr->wcs[byte_idx++] = WEOF;
451 else if (mbclen == (size_t) -1 || mbclen == 0)
453 /* It is an invalid character or '\0'. Just use the byte. */
454 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
456 if (BE (pstr->trans != NULL, 0))
457 ch = pstr->trans [ch];
458 pstr->mbs[byte_idx] = ch;
460 if (BE (pstr->offsets_needed != 0, 0))
461 pstr->offsets[byte_idx] = src_idx;
464 /* And also cast it to wide char. */
465 pstr->wcs[byte_idx++] = (wchar_t) ch;
466 if (BE (mbclen == (size_t) -1, 0))
467 pstr->cur_state = prev_st;
471 /* The buffer doesn't have enough space, finish to build. */
472 pstr->cur_state = prev_st;
476 pstr->valid_len = byte_idx;
477 pstr->valid_raw_len = src_idx;
481 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
486 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
493 /* Skip the characters which are not necessary to check. */
494 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
495 rawbuf_idx < new_raw_idx;)
498 remain_len = pstr->len - rawbuf_idx;
499 prev_st = pstr->cur_state;
500 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
501 remain_len, &pstr->cur_state);
502 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
504 /* We treat these cases as a single byte character. */
505 if (mbclen == 0 || remain_len == 0)
508 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
510 pstr->cur_state = prev_st;
512 /* Then proceed the next character. */
513 rawbuf_idx += mbclen;
515 *last_wc = (wint_t) wc;
518 #endif /* RE_ENABLE_I18N */
520 /* Build the buffer PSTR->MBS, and apply the translation if we need.
521 This function is used in case of REG_ICASE. */
525 build_upper_buffer (re_string_t *pstr)
527 Idx char_idx, end_idx;
528 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
530 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
532 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
533 if (BE (pstr->trans != NULL, 0))
534 ch = pstr->trans[ch];
536 pstr->mbs[char_idx] = toupper (ch);
538 pstr->mbs[char_idx] = ch;
540 pstr->valid_len = char_idx;
541 pstr->valid_raw_len = char_idx;
544 /* Apply TRANS to the buffer in PSTR. */
548 re_string_translate_buffer (re_string_t *pstr)
550 Idx buf_idx, end_idx;
551 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
553 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
555 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
556 pstr->mbs[buf_idx] = pstr->trans[ch];
559 pstr->valid_len = buf_idx;
560 pstr->valid_raw_len = buf_idx;
563 /* This function re-construct the buffers.
564 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
565 convert to upper case in case of REG_ICASE, apply translation. */
569 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
573 if (BE (pstr->raw_mbs_idx <= idx, 0))
574 offset = idx - pstr->raw_mbs_idx;
578 #ifdef RE_ENABLE_I18N
579 if (pstr->mb_cur_max > 1)
580 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
581 #endif /* RE_ENABLE_I18N */
582 pstr->len = pstr->raw_len;
583 pstr->stop = pstr->raw_stop;
585 pstr->raw_mbs_idx = 0;
586 pstr->valid_raw_len = 0;
587 pstr->offsets_needed = 0;
588 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
589 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
590 if (!pstr->mbs_allocated)
591 pstr->mbs = (unsigned char *) pstr->raw_mbs;
595 if (BE (offset != 0, 1))
597 /* Are the characters which are already checked remain? */
598 if (BE (offset < pstr->valid_raw_len, 1)
599 #ifdef RE_ENABLE_I18N
600 /* Handling this would enlarge the code too much.
601 Accept a slowdown in that case. */
602 && pstr->offsets_needed == 0
606 /* Yes, move them to the front of the buffer. */
607 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
608 #ifdef RE_ENABLE_I18N
609 if (pstr->mb_cur_max > 1)
610 memmove (pstr->wcs, pstr->wcs + offset,
611 (pstr->valid_len - offset) * sizeof (wint_t));
612 #endif /* RE_ENABLE_I18N */
613 if (BE (pstr->mbs_allocated, 0))
614 memmove (pstr->mbs, pstr->mbs + offset,
615 pstr->valid_len - offset);
616 pstr->valid_len -= offset;
617 pstr->valid_raw_len -= offset;
619 assert (pstr->valid_len > 0);
624 /* No, skip all characters until IDX. */
625 #ifdef RE_ENABLE_I18N
626 if (BE (pstr->offsets_needed, 0))
628 pstr->len = pstr->raw_len - idx + offset;
629 pstr->stop = pstr->raw_stop - idx + offset;
630 pstr->offsets_needed = 0;
634 #ifdef RE_ENABLE_I18N
635 if (pstr->mb_cur_max > 1)
642 const unsigned char *raw, *p, *q, *end;
644 /* Special case UTF-8. Multi-byte chars start with any
645 byte other than 0x80 - 0xbf. */
646 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
647 end = raw + (offset - pstr->mb_cur_max);
648 p = raw + offset - 1;
650 /* We know the wchar_t encoding is UCS4, so for the simple
651 case, ASCII characters, skip the conversion step. */
652 if (isascii (*p) && BE (pstr->trans == NULL, 1))
654 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
660 for (; p >= end; --p)
661 if ((*p & 0xc0) != 0x80)
665 Idx mlen = raw + pstr->len - p;
666 unsigned char buf[6];
670 if (BE (pstr->trans != NULL, 0))
672 int i = mlen < 6 ? mlen : 6;
674 buf[i] = pstr->trans[p[i]];
677 /* XXX Don't use mbrtowc, we know which conversion
678 to use (UTF-8 -> UCS4). */
679 memset (&cur_state, 0, sizeof (cur_state));
680 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
682 if (raw + offset - p <= mbclen
683 && mbclen < (size_t) -2)
685 memset (&pstr->cur_state, '\0',
687 pstr->valid_len = mbclen - (raw + offset - p);
695 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
698 = re_string_context_at (pstr, pstr->valid_raw_len - 1, eflags);
700 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
701 && IS_WIDE_WORD_CHAR (wc))
703 : ((IS_WIDE_NEWLINE (wc)
704 && pstr->newline_anchor)
705 ? CONTEXT_NEWLINE : 0));
706 if (BE (pstr->valid_len, 0))
708 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
709 pstr->wcs[wcs_idx] = WEOF;
710 if (pstr->mbs_allocated)
711 memset (pstr->mbs, -1, pstr->valid_len);
713 pstr->valid_raw_len = pstr->valid_len;
716 #endif /* RE_ENABLE_I18N */
718 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
719 pstr->valid_raw_len = 0;
722 pstr->tip_context = (bitset_contain (pstr->word_char, c)
724 : ((IS_NEWLINE (c) && pstr->newline_anchor)
725 ? CONTEXT_NEWLINE : 0));
728 if (!BE (pstr->mbs_allocated, 0))
731 pstr->raw_mbs_idx = idx;
733 pstr->stop -= offset;
735 /* Then build the buffers. */
736 #ifdef RE_ENABLE_I18N
737 if (pstr->mb_cur_max > 1)
741 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
742 if (BE (ret != REG_NOERROR, 0))
746 build_wcs_buffer (pstr);
749 #endif /* RE_ENABLE_I18N */
750 if (BE (pstr->mbs_allocated, 0))
753 build_upper_buffer (pstr);
754 else if (pstr->trans != NULL)
755 re_string_translate_buffer (pstr);
758 pstr->valid_len = pstr->len;
765 internal_function __attribute ((pure))
766 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
771 /* Handle the common (easiest) cases first. */
772 if (BE (!pstr->mbs_allocated, 1))
773 return re_string_peek_byte (pstr, idx);
775 #ifdef RE_ENABLE_I18N
776 if (pstr->mb_cur_max > 1
777 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
778 return re_string_peek_byte (pstr, idx);
781 off = pstr->cur_idx + idx;
782 #ifdef RE_ENABLE_I18N
783 if (pstr->offsets_needed)
784 off = pstr->offsets[off];
787 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
789 #ifdef RE_ENABLE_I18N
790 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
791 this function returns CAPITAL LETTER I instead of first byte of
792 DOTLESS SMALL LETTER I. The latter would confuse the parser,
793 since peek_byte_case doesn't advance cur_idx in any way. */
794 if (pstr->offsets_needed && !isascii (ch))
795 return re_string_peek_byte (pstr, idx);
802 internal_function __attribute ((pure))
803 re_string_fetch_byte_case (re_string_t *pstr)
805 if (BE (!pstr->mbs_allocated, 1))
806 return re_string_fetch_byte (pstr);
808 #ifdef RE_ENABLE_I18N
809 if (pstr->offsets_needed)
814 /* For tr_TR.UTF-8 [[:islower:]] there is
815 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
816 in that case the whole multi-byte character and return
817 the original letter. On the other side, with
818 [[: DOTLESS SMALL LETTER I return [[:I, as doing
819 anything else would complicate things too much. */
821 if (!re_string_first_byte (pstr, pstr->cur_idx))
822 return re_string_fetch_byte (pstr);
824 off = pstr->offsets[pstr->cur_idx];
825 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
828 return re_string_fetch_byte (pstr);
830 re_string_skip_bytes (pstr,
831 re_string_char_size_at (pstr, pstr->cur_idx));
836 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
841 re_string_destruct (re_string_t *pstr)
843 #ifdef RE_ENABLE_I18N
845 re_free (pstr->offsets);
846 #endif /* RE_ENABLE_I18N */
847 if (pstr->mbs_allocated)
851 /* Return the context at IDX in INPUT. */
855 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
858 if (BE (! REG_VALID_INDEX (idx), 0))
859 /* In this case, we use the value stored in input->tip_context,
860 since we can't know the character in input->mbs[-1] here. */
861 return input->tip_context;
862 if (BE (idx == input->len, 0))
863 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
864 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
865 #ifdef RE_ENABLE_I18N
866 if (input->mb_cur_max > 1)
870 while(input->wcs[wc_idx] == WEOF)
873 /* It must not happen. */
874 assert (REG_VALID_INDEX (wc_idx));
877 if (! REG_VALID_INDEX (wc_idx))
878 return input->tip_context;
880 wc = input->wcs[wc_idx];
881 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
883 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
884 ? CONTEXT_NEWLINE : 0);
889 c = re_string_byte_at (input, idx);
890 if (bitset_contain (input->word_char, c))
892 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
896 /* Functions for set operation. */
900 re_node_set_alloc (re_node_set *set, Idx size)
904 set->elems = re_malloc (Idx, size);
905 if (BE (set->elems == NULL, 0))
912 re_node_set_init_1 (re_node_set *set, Idx elem)
916 set->elems = re_malloc (Idx, 1);
917 if (BE (set->elems == NULL, 0))
919 set->alloc = set->nelem = 0;
922 set->elems[0] = elem;
928 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
931 set->elems = re_malloc (Idx, 2);
932 if (BE (set->elems == NULL, 0))
937 set->elems[0] = elem1;
944 set->elems[0] = elem1;
945 set->elems[1] = elem2;
949 set->elems[0] = elem2;
950 set->elems[1] = elem1;
958 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
960 dest->nelem = src->nelem;
963 dest->alloc = dest->nelem;
964 dest->elems = re_malloc (Idx, dest->alloc);
965 if (BE (dest->elems == NULL, 0))
967 dest->alloc = dest->nelem = 0;
970 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
973 re_node_set_init_empty (dest);
977 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
978 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
979 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
983 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
984 const re_node_set *src2)
986 Idx i1, i2, is, id, delta, sbase;
987 if (src1->nelem == 0 || src2->nelem == 0)
990 /* We need dest->nelem + 2 * elems_in_intersection; this is a
991 conservative estimate. */
992 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
994 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
995 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
996 if (BE (new_elems == NULL, 0))
998 dest->elems = new_elems;
999 dest->alloc = new_alloc;
1002 /* Find the items in the intersection of SRC1 and SRC2, and copy
1003 into the top of DEST those that are not already in DEST itself. */
1004 sbase = dest->nelem + src1->nelem + src2->nelem;
1005 i1 = src1->nelem - 1;
1006 i2 = src2->nelem - 1;
1007 id = dest->nelem - 1;
1010 if (src1->elems[i1] == src2->elems[i2])
1012 /* Try to find the item in DEST. Maybe we could binary search? */
1013 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1016 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1017 dest->elems[--sbase] = src1->elems[i1];
1019 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1023 /* Lower the highest of the two items. */
1024 else if (src1->elems[i1] < src2->elems[i2])
1026 if (! REG_VALID_INDEX (--i2))
1031 if (! REG_VALID_INDEX (--i1))
1036 id = dest->nelem - 1;
1037 is = dest->nelem + src1->nelem + src2->nelem - 1;
1038 delta = is - sbase + 1;
1040 /* Now copy. When DELTA becomes zero, the remaining
1041 DEST elements are already in place; this is more or
1042 less the same loop that is in re_node_set_merge. */
1043 dest->nelem += delta;
1044 if (delta > 0 && REG_VALID_INDEX (id))
1047 if (dest->elems[is] > dest->elems[id])
1049 /* Copy from the top. */
1050 dest->elems[id + delta--] = dest->elems[is--];
1056 /* Slide from the bottom. */
1057 dest->elems[id + delta] = dest->elems[id];
1058 if (! REG_VALID_INDEX (--id))
1063 /* Copy remaining SRC elements. */
1064 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1069 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1070 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1072 static reg_errcode_t
1074 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1075 const re_node_set *src2)
1078 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1080 dest->alloc = src1->nelem + src2->nelem;
1081 dest->elems = re_malloc (Idx, dest->alloc);
1082 if (BE (dest->elems == NULL, 0))
1087 if (src1 != NULL && src1->nelem > 0)
1088 return re_node_set_init_copy (dest, src1);
1089 else if (src2 != NULL && src2->nelem > 0)
1090 return re_node_set_init_copy (dest, src2);
1092 re_node_set_init_empty (dest);
1095 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1097 if (src1->elems[i1] > src2->elems[i2])
1099 dest->elems[id++] = src2->elems[i2++];
1102 if (src1->elems[i1] == src2->elems[i2])
1104 dest->elems[id++] = src1->elems[i1++];
1106 if (i1 < src1->nelem)
1108 memcpy (dest->elems + id, src1->elems + i1,
1109 (src1->nelem - i1) * sizeof (Idx));
1110 id += src1->nelem - i1;
1112 else if (i2 < src2->nelem)
1114 memcpy (dest->elems + id, src2->elems + i2,
1115 (src2->nelem - i2) * sizeof (Idx));
1116 id += src2->nelem - i2;
1122 /* Calculate the union set of the sets DEST and SRC. And store it to
1123 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1125 static reg_errcode_t
1127 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1129 Idx is, id, sbase, delta;
1130 if (src == NULL || src->nelem == 0)
1132 if (dest->alloc < 2 * src->nelem + dest->nelem)
1134 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1135 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1136 if (BE (new_buffer == NULL, 0))
1138 dest->elems = new_buffer;
1139 dest->alloc = new_alloc;
1142 if (BE (dest->nelem == 0, 0))
1144 dest->nelem = src->nelem;
1145 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1149 /* Copy into the top of DEST the items of SRC that are not
1150 found in DEST. Maybe we could binary search in DEST? */
1151 for (sbase = dest->nelem + 2 * src->nelem,
1152 is = src->nelem - 1, id = dest->nelem - 1;
1153 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1155 if (dest->elems[id] == src->elems[is])
1157 else if (dest->elems[id] < src->elems[is])
1158 dest->elems[--sbase] = src->elems[is--];
1159 else /* if (dest->elems[id] > src->elems[is]) */
1163 if (REG_VALID_INDEX (is))
1165 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1167 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1170 id = dest->nelem - 1;
1171 is = dest->nelem + 2 * src->nelem - 1;
1172 delta = is - sbase + 1;
1176 /* Now copy. When DELTA becomes zero, the remaining
1177 DEST elements are already in place. */
1178 dest->nelem += delta;
1181 if (dest->elems[is] > dest->elems[id])
1183 /* Copy from the top. */
1184 dest->elems[id + delta--] = dest->elems[is--];
1190 /* Slide from the bottom. */
1191 dest->elems[id + delta] = dest->elems[id];
1192 if (! REG_VALID_INDEX (--id))
1194 /* Copy remaining SRC elements. */
1195 memcpy (dest->elems, dest->elems + sbase,
1196 delta * sizeof (Idx));
1205 /* Insert the new element ELEM to the re_node_set* SET.
1206 SET should not already have ELEM.
1207 Return true if successful. */
1211 re_node_set_insert (re_node_set *set, Idx elem)
1214 /* In case the set is empty. */
1215 if (set->alloc == 0)
1216 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1218 if (BE (set->nelem, 0) == 0)
1220 /* We already guaranteed above that set->alloc != 0. */
1221 set->elems[0] = elem;
1226 /* Realloc if we need. */
1227 if (set->alloc == set->nelem)
1230 set->alloc = set->alloc * 2;
1231 new_elems = re_realloc (set->elems, Idx, set->alloc);
1232 if (BE (new_elems == NULL, 0))
1234 set->elems = new_elems;
1237 /* Move the elements which follows the new element. Test the
1238 first element separately to skip a check in the inner loop. */
1239 if (elem < set->elems[0])
1242 for (idx = set->nelem; idx > 0; idx--)
1243 set->elems[idx] = set->elems[idx - 1];
1247 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1248 set->elems[idx] = set->elems[idx - 1];
1251 /* Insert the new element. */
1252 set->elems[idx] = elem;
1257 /* Insert the new element ELEM to the re_node_set* SET.
1258 SET should not already have any element greater than or equal to ELEM.
1259 Return true if successful. */
1263 re_node_set_insert_last (re_node_set *set, Idx elem)
1265 /* Realloc if we need. */
1266 if (set->alloc == set->nelem)
1269 set->alloc = (set->alloc + 1) * 2;
1270 new_elems = re_realloc (set->elems, Idx, set->alloc);
1271 if (BE (new_elems == NULL, 0))
1273 set->elems = new_elems;
1276 /* Insert the new element. */
1277 set->elems[set->nelem++] = elem;
1281 /* Compare two node sets SET1 and SET2.
1282 Return true if SET1 and SET2 are equivalent. */
1285 internal_function __attribute ((pure))
1286 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1289 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1291 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1292 if (set1->elems[i] != set2->elems[i])
1297 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1300 internal_function __attribute ((pure))
1301 re_node_set_contains (const re_node_set *set, Idx elem)
1303 __re_size_t idx, right, mid;
1304 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1307 /* Binary search the element. */
1309 right = set->nelem - 1;
1312 mid = (idx + right) / 2;
1313 if (set->elems[mid] < elem)
1318 return set->elems[idx] == elem ? idx + 1 : 0;
1323 re_node_set_remove_at (re_node_set *set, Idx idx)
1325 if (idx < 0 || idx >= set->nelem)
1328 for (; idx < set->nelem; idx++)
1329 set->elems[idx] = set->elems[idx + 1];
1333 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1334 Or return REG_MISSING if an error occurred. */
1338 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1340 int type = token.type;
1341 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1343 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1344 Idx *new_nexts, *new_indices;
1345 re_node_set *new_edests, *new_eclosures;
1346 re_token_t *new_nodes;
1347 size_t max_object_size =
1348 MAX (sizeof (re_token_t),
1349 MAX (sizeof (re_node_set),
1352 /* Avoid overflows. */
1353 if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1356 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1357 if (BE (new_nodes == NULL, 0))
1359 dfa->nodes = new_nodes;
1360 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1361 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1362 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1363 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1364 if (BE (new_nexts == NULL || new_indices == NULL
1365 || new_edests == NULL || new_eclosures == NULL, 0))
1367 dfa->nexts = new_nexts;
1368 dfa->org_indices = new_indices;
1369 dfa->edests = new_edests;
1370 dfa->eclosures = new_eclosures;
1371 dfa->nodes_alloc = new_nodes_alloc;
1373 dfa->nodes[dfa->nodes_len] = token;
1374 dfa->nodes[dfa->nodes_len].constraint = 0;
1375 #ifdef RE_ENABLE_I18N
1376 dfa->nodes[dfa->nodes_len].accept_mb =
1377 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1379 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1380 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1381 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1382 return dfa->nodes_len++;
1385 static inline re_hashval_t
1387 calc_state_hash (const re_node_set *nodes, unsigned int context)
1389 re_hashval_t hash = nodes->nelem + context;
1391 for (i = 0 ; i < nodes->nelem ; i++)
1392 hash += nodes->elems[i];
1396 /* Search for the state whose node_set is equivalent to NODES.
1397 Return the pointer to the state, if we found it in the DFA.
1398 Otherwise create the new one and return it. In case of an error
1399 return NULL and set the error code in ERR.
1400 Note: - We assume NULL as the invalid state, then it is possible that
1401 return value is NULL and ERR is REG_NOERROR.
1402 - We never return non-NULL value in case of any errors, it is for
1405 static re_dfastate_t *
1407 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1408 const re_node_set *nodes)
1411 re_dfastate_t *new_state;
1412 struct re_state_table_entry *spot;
1415 /* Suppress bogus uninitialized-variable warnings. */
1418 if (BE (nodes->nelem == 0, 0))
1423 hash = calc_state_hash (nodes, 0);
1424 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1426 for (i = 0 ; i < spot->num ; i++)
1428 re_dfastate_t *state = spot->array[i];
1429 if (hash != state->hash)
1431 if (re_node_set_compare (&state->nodes, nodes))
1435 /* There are no appropriate state in the dfa, create the new one. */
1436 new_state = create_ci_newstate (dfa, nodes, hash);
1437 if (BE (new_state == NULL, 0))
1443 /* Search for the state whose node_set is equivalent to NODES and
1444 whose context is equivalent to CONTEXT.
1445 Return the pointer to the state, if we found it in the DFA.
1446 Otherwise create the new one and return it. In case of an error
1447 return NULL and set the error code in ERR.
1448 Note: - We assume NULL as the invalid state, then it is possible that
1449 return value is NULL and ERR is REG_NOERROR.
1450 - We never return non-NULL value in case of any errors, it is for
1453 static re_dfastate_t *
1455 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1456 const re_node_set *nodes, unsigned int context)
1459 re_dfastate_t *new_state;
1460 struct re_state_table_entry *spot;
1463 /* Suppress bogus uninitialized-variable warnings. */
1466 if (nodes->nelem == 0)
1471 hash = calc_state_hash (nodes, context);
1472 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1474 for (i = 0 ; i < spot->num ; i++)
1476 re_dfastate_t *state = spot->array[i];
1477 if (state->hash == hash
1478 && state->context == context
1479 && re_node_set_compare (state->entrance_nodes, nodes))
1482 /* There are no appropriate state in `dfa', create the new one. */
1483 new_state = create_cd_newstate (dfa, nodes, context, hash);
1484 if (BE (new_state == NULL, 0))
1490 /* Finish initialization of the new state NEWSTATE, and using its hash value
1491 HASH put in the appropriate bucket of DFA's state table. Return value
1492 indicates the error code if failed. */
1494 static reg_errcode_t
1495 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1498 struct re_state_table_entry *spot;
1502 newstate->hash = hash;
1503 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1504 if (BE (err != REG_NOERROR, 0))
1506 for (i = 0; i < newstate->nodes.nelem; i++)
1508 Idx elem = newstate->nodes.elems[i];
1509 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1510 if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1514 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1515 if (BE (spot->alloc <= spot->num, 0))
1517 Idx new_alloc = 2 * spot->num + 2;
1518 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1520 if (BE (new_array == NULL, 0))
1522 spot->array = new_array;
1523 spot->alloc = new_alloc;
1525 spot->array[spot->num++] = newstate;
1530 free_state (re_dfastate_t *state)
1532 re_node_set_free (&state->non_eps_nodes);
1533 re_node_set_free (&state->inveclosure);
1534 if (state->entrance_nodes != &state->nodes)
1536 re_node_set_free (state->entrance_nodes);
1537 re_free (state->entrance_nodes);
1539 re_node_set_free (&state->nodes);
1540 re_free (state->word_trtable);
1541 re_free (state->trtable);
1545 /* Create the new state which is independ of contexts.
1546 Return the new state if succeeded, otherwise return NULL. */
1548 static re_dfastate_t *
1550 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1555 re_dfastate_t *newstate;
1557 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1558 if (BE (newstate == NULL, 0))
1560 err = re_node_set_init_copy (&newstate->nodes, nodes);
1561 if (BE (err != REG_NOERROR, 0))
1567 newstate->entrance_nodes = &newstate->nodes;
1568 for (i = 0 ; i < nodes->nelem ; i++)
1570 re_token_t *node = dfa->nodes + nodes->elems[i];
1571 re_token_type_t type = node->type;
1572 if (type == CHARACTER && !node->constraint)
1574 #ifdef RE_ENABLE_I18N
1575 newstate->accept_mb |= node->accept_mb;
1576 #endif /* RE_ENABLE_I18N */
1578 /* If the state has the halt node, the state is a halt state. */
1579 if (type == END_OF_RE)
1581 else if (type == OP_BACK_REF)
1582 newstate->has_backref = 1;
1583 else if (type == ANCHOR || node->constraint)
1584 newstate->has_constraint = 1;
1586 err = register_state (dfa, newstate, hash);
1587 if (BE (err != REG_NOERROR, 0))
1589 free_state (newstate);
1595 /* Create the new state which is depend on the context CONTEXT.
1596 Return the new state if succeeded, otherwise return NULL. */
1598 static re_dfastate_t *
1600 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1601 unsigned int context, re_hashval_t hash)
1603 Idx i, nctx_nodes = 0;
1605 re_dfastate_t *newstate;
1607 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1608 if (BE (newstate == NULL, 0))
1610 err = re_node_set_init_copy (&newstate->nodes, nodes);
1611 if (BE (err != REG_NOERROR, 0))
1617 newstate->context = context;
1618 newstate->entrance_nodes = &newstate->nodes;
1620 for (i = 0 ; i < nodes->nelem ; i++)
1622 unsigned int constraint = 0;
1623 re_token_t *node = dfa->nodes + nodes->elems[i];
1624 re_token_type_t type = node->type;
1625 if (node->constraint)
1626 constraint = node->constraint;
1628 if (type == CHARACTER && !constraint)
1630 #ifdef RE_ENABLE_I18N
1631 newstate->accept_mb |= node->accept_mb;
1632 #endif /* RE_ENABLE_I18N */
1634 /* If the state has the halt node, the state is a halt state. */
1635 if (type == END_OF_RE)
1637 else if (type == OP_BACK_REF)
1638 newstate->has_backref = 1;
1639 else if (type == ANCHOR)
1640 constraint = node->opr.ctx_type;
1644 if (newstate->entrance_nodes == &newstate->nodes)
1646 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1647 if (BE (newstate->entrance_nodes == NULL, 0))
1649 free_state (newstate);
1652 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1654 newstate->has_constraint = 1;
1657 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1659 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1664 err = register_state (dfa, newstate, hash);
1665 if (BE (err != REG_NOERROR, 0))
1667 free_state (newstate);