1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static void re_string_construct_common (const char *str, Idx len,
22 RE_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
30 re_hashval_t hash) internal_function;
32 /* Functions for string operation. */
34 /* This function allocate the buffers. It is necessary to call
35 re_string_reconstruct before using the object. */
39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
40 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
45 /* Ensure at least one character fits into the buffers. */
46 if (init_len < dfa->mb_cur_max)
47 init_len = dfa->mb_cur_max;
48 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49 re_string_construct_common (str, len, pstr, trans, icase, dfa);
51 ret = re_string_realloc_buffers (pstr, init_buf_len);
52 if (BE (ret != REG_NOERROR, 0))
55 pstr->word_char = dfa->word_char;
56 pstr->word_ops_used = dfa->word_ops_used;
57 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59 pstr->valid_raw_len = pstr->valid_len;
63 /* This function allocate the buffers, and initialize them. */
67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
68 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
71 memset (pstr, '\0', sizeof (re_string_t));
72 re_string_construct_common (str, len, pstr, trans, icase, dfa);
76 ret = re_string_realloc_buffers (pstr, len + 1);
77 if (BE (ret != REG_NOERROR, 0))
80 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
85 if (dfa->mb_cur_max > 1)
89 ret = build_wcs_upper_buffer (pstr);
90 if (BE (ret != REG_NOERROR, 0))
92 if (pstr->valid_raw_len >= len)
94 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 if (BE (ret != REG_NOERROR, 0))
102 #endif /* RE_ENABLE_I18N */
103 build_upper_buffer (pstr);
107 #ifdef RE_ENABLE_I18N
108 if (dfa->mb_cur_max > 1)
109 build_wcs_buffer (pstr);
111 #endif /* RE_ENABLE_I18N */
114 re_string_translate_buffer (pstr);
117 pstr->valid_len = pstr->bufs_len;
118 pstr->valid_raw_len = pstr->bufs_len;
126 /* Helper functions for re_string_allocate, and re_string_construct. */
130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
132 #ifdef RE_ENABLE_I18N
133 if (pstr->mb_cur_max > 1)
137 /* Avoid overflow. */
138 size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
139 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
142 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
143 if (BE (new_wcs == NULL, 0))
146 if (pstr->offsets != NULL)
148 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
149 if (BE (new_offsets == NULL, 0))
151 pstr->offsets = new_offsets;
154 #endif /* RE_ENABLE_I18N */
155 if (pstr->mbs_allocated)
157 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
159 if (BE (new_mbs == NULL, 0))
163 pstr->bufs_len = new_buf_len;
170 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
171 RE_TRANSLATE_TYPE trans, bool icase,
174 pstr->raw_mbs = (const unsigned char *) str;
179 pstr->mbs_allocated = (trans != NULL || icase);
180 pstr->mb_cur_max = dfa->mb_cur_max;
181 pstr->is_utf8 = dfa->is_utf8;
182 pstr->map_notascii = dfa->map_notascii;
183 pstr->stop = pstr->len;
184 pstr->raw_stop = pstr->stop;
187 #ifdef RE_ENABLE_I18N
189 /* Build wide character buffer PSTR->WCS.
190 If the byte sequence of the string are:
191 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
192 Then wide character buffer will be:
193 <wc1> , WEOF , <wc2> , WEOF , <wc3>
194 We use WEOF for padding, they indicate that the position isn't
195 a first byte of a multibyte character.
197 Note that this function assumes PSTR->VALID_LEN elements are already
198 built and starts from PSTR->VALID_LEN. */
202 build_wcs_buffer (re_string_t *pstr)
205 unsigned char buf[MB_LEN_MAX];
206 assert (MB_LEN_MAX >= pstr->mb_cur_max);
208 unsigned char buf[64];
211 Idx byte_idx, end_idx, remain_len;
214 /* Build the buffers from pstr->valid_len to either pstr->len or
216 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
217 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
222 remain_len = end_idx - byte_idx;
223 prev_st = pstr->cur_state;
224 /* Apply the translation if we need. */
225 if (BE (pstr->trans != NULL, 0))
229 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
231 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
232 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
234 p = (const char *) buf;
237 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
238 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
239 if (BE (mbclen == (size_t) -2, 0))
241 /* The buffer doesn't have enough space, finish to build. */
242 pstr->cur_state = prev_st;
245 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
247 /* We treat these cases as a singlebyte character. */
249 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
250 if (BE (pstr->trans != NULL, 0))
251 wc = pstr->trans[wc];
252 pstr->cur_state = prev_st;
255 /* Write wide character and padding. */
256 pstr->wcs[byte_idx++] = wc;
257 /* Write paddings. */
258 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
259 pstr->wcs[byte_idx++] = WEOF;
261 pstr->valid_len = byte_idx;
262 pstr->valid_raw_len = byte_idx;
265 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
266 but for REG_ICASE. */
270 build_wcs_upper_buffer (re_string_t *pstr)
273 Idx src_idx, byte_idx, end_idx, remain_len;
276 char buf[MB_LEN_MAX];
277 assert (MB_LEN_MAX >= pstr->mb_cur_max);
282 byte_idx = pstr->valid_len;
283 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
285 /* The following optimization assumes that ASCII characters can be
286 mapped to wide characters with a simple cast. */
287 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
289 while (byte_idx < end_idx)
293 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
294 && mbsinit (&pstr->cur_state))
296 /* In case of a singlebyte character. */
298 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
299 /* The next step uses the assumption that wchar_t is encoded
300 ASCII-safe: all ASCII values can be converted like this. */
301 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
306 remain_len = end_idx - byte_idx;
307 prev_st = pstr->cur_state;
308 mbclen = mbrtowc (&wc,
309 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
310 + byte_idx), remain_len, &pstr->cur_state);
311 if (BE (mbclen < (size_t) -2, 1))
319 mbcdlen = wcrtomb (buf, wcu, &prev_st);
320 if (BE (mbclen == mbcdlen, 1))
321 memcpy (pstr->mbs + byte_idx, buf, mbclen);
329 memcpy (pstr->mbs + byte_idx,
330 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
331 pstr->wcs[byte_idx++] = wcu;
332 /* Write paddings. */
333 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
334 pstr->wcs[byte_idx++] = WEOF;
336 else if (mbclen == (size_t) -1 || mbclen == 0)
338 /* It is an invalid character or '\0'. Just use the byte. */
339 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
340 pstr->mbs[byte_idx] = ch;
341 /* And also cast it to wide char. */
342 pstr->wcs[byte_idx++] = (wchar_t) ch;
343 if (BE (mbclen == (size_t) -1, 0))
344 pstr->cur_state = prev_st;
348 /* The buffer doesn't have enough space, finish to build. */
349 pstr->cur_state = prev_st;
353 pstr->valid_len = byte_idx;
354 pstr->valid_raw_len = byte_idx;
358 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
363 remain_len = end_idx - byte_idx;
364 prev_st = pstr->cur_state;
365 if (BE (pstr->trans != NULL, 0))
369 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
371 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
372 buf[i] = pstr->trans[ch];
374 p = (const char *) buf;
377 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
378 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
379 if (BE (mbclen < (size_t) -2, 1))
387 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
388 if (BE (mbclen == mbcdlen, 1))
389 memcpy (pstr->mbs + byte_idx, buf, mbclen);
390 else if (mbcdlen != (size_t) -1)
394 if (byte_idx + mbcdlen > pstr->bufs_len)
396 pstr->cur_state = prev_st;
400 if (pstr->offsets == NULL)
402 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
404 if (pstr->offsets == NULL)
407 if (!pstr->offsets_needed)
409 for (i = 0; i < (size_t) byte_idx; ++i)
410 pstr->offsets[i] = i;
411 pstr->offsets_needed = 1;
414 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
415 pstr->wcs[byte_idx] = wcu;
416 pstr->offsets[byte_idx] = src_idx;
417 for (i = 1; i < mbcdlen; ++i)
419 pstr->offsets[byte_idx + i]
420 = src_idx + (i < mbclen ? i : mbclen - 1);
421 pstr->wcs[byte_idx + i] = WEOF;
423 pstr->len += mbcdlen - mbclen;
424 if (pstr->raw_stop > src_idx)
425 pstr->stop += mbcdlen - mbclen;
426 end_idx = (pstr->bufs_len > pstr->len)
427 ? pstr->len : pstr->bufs_len;
433 memcpy (pstr->mbs + byte_idx, p, mbclen);
436 memcpy (pstr->mbs + byte_idx, p, mbclen);
438 if (BE (pstr->offsets_needed != 0, 0))
441 for (i = 0; i < mbclen; ++i)
442 pstr->offsets[byte_idx + i] = src_idx + i;
446 pstr->wcs[byte_idx++] = wcu;
447 /* Write paddings. */
448 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
449 pstr->wcs[byte_idx++] = WEOF;
451 else if (mbclen == (size_t) -1 || mbclen == 0)
453 /* It is an invalid character or '\0'. Just use the byte. */
454 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
456 if (BE (pstr->trans != NULL, 0))
457 ch = pstr->trans [ch];
458 pstr->mbs[byte_idx] = ch;
460 if (BE (pstr->offsets_needed != 0, 0))
461 pstr->offsets[byte_idx] = src_idx;
464 /* And also cast it to wide char. */
465 pstr->wcs[byte_idx++] = (wchar_t) ch;
466 if (BE (mbclen == (size_t) -1, 0))
467 pstr->cur_state = prev_st;
471 /* The buffer doesn't have enough space, finish to build. */
472 pstr->cur_state = prev_st;
476 pstr->valid_len = byte_idx;
477 pstr->valid_raw_len = src_idx;
481 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
486 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
493 /* Skip the characters which are not necessary to check. */
494 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
495 rawbuf_idx < new_raw_idx;)
498 remain_len = pstr->len - rawbuf_idx;
499 prev_st = pstr->cur_state;
500 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
501 remain_len, &pstr->cur_state);
502 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
504 /* We treat these cases as a singlebyte character. */
506 pstr->cur_state = prev_st;
508 /* Then proceed the next character. */
509 rawbuf_idx += mbclen;
511 *last_wc = (wint_t) wc;
514 #endif /* RE_ENABLE_I18N */
516 /* Build the buffer PSTR->MBS, and apply the translation if we need.
517 This function is used in case of REG_ICASE. */
521 build_upper_buffer (re_string_t *pstr)
523 Idx char_idx, end_idx;
524 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
526 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
528 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
529 if (BE (pstr->trans != NULL, 0))
530 ch = pstr->trans[ch];
532 pstr->mbs[char_idx] = toupper (ch);
534 pstr->mbs[char_idx] = ch;
536 pstr->valid_len = char_idx;
537 pstr->valid_raw_len = char_idx;
540 /* Apply TRANS to the buffer in PSTR. */
544 re_string_translate_buffer (re_string_t *pstr)
546 Idx buf_idx, end_idx;
547 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
549 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
551 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
552 pstr->mbs[buf_idx] = pstr->trans[ch];
555 pstr->valid_len = buf_idx;
556 pstr->valid_raw_len = buf_idx;
559 /* This function re-construct the buffers.
560 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
561 convert to upper case in case of REG_ICASE, apply translation. */
565 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
569 if (BE (pstr->raw_mbs_idx <= idx, 0))
570 offset = idx - pstr->raw_mbs_idx;
574 #ifdef RE_ENABLE_I18N
575 if (pstr->mb_cur_max > 1)
576 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
577 #endif /* RE_ENABLE_I18N */
578 pstr->len = pstr->raw_len;
579 pstr->stop = pstr->raw_stop;
581 pstr->raw_mbs_idx = 0;
582 pstr->valid_raw_len = 0;
583 pstr->offsets_needed = 0;
584 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
585 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
586 if (!pstr->mbs_allocated)
587 pstr->mbs = (unsigned char *) pstr->raw_mbs;
591 if (BE (offset != 0, 1))
593 /* Are the characters which are already checked remain? */
594 if (BE (offset < pstr->valid_raw_len, 1)
595 #ifdef RE_ENABLE_I18N
596 /* Handling this would enlarge the code too much.
597 Accept a slowdown in that case. */
598 && pstr->offsets_needed == 0
602 /* Yes, move them to the front of the buffer. */
603 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
604 #ifdef RE_ENABLE_I18N
605 if (pstr->mb_cur_max > 1)
606 memmove (pstr->wcs, pstr->wcs + offset,
607 (pstr->valid_len - offset) * sizeof (wint_t));
608 #endif /* RE_ENABLE_I18N */
609 if (BE (pstr->mbs_allocated, 0))
610 memmove (pstr->mbs, pstr->mbs + offset,
611 pstr->valid_len - offset);
612 pstr->valid_len -= offset;
613 pstr->valid_raw_len -= offset;
615 assert (pstr->valid_len > 0);
620 /* No, skip all characters until IDX. */
621 #ifdef RE_ENABLE_I18N
622 if (BE (pstr->offsets_needed, 0))
624 pstr->len = pstr->raw_len - idx + offset;
625 pstr->stop = pstr->raw_stop - idx + offset;
626 pstr->offsets_needed = 0;
630 pstr->valid_raw_len = 0;
631 #ifdef RE_ENABLE_I18N
632 if (pstr->mb_cur_max > 1)
639 const unsigned char *raw, *p, *q, *end;
641 /* Special case UTF-8. Multi-byte chars start with any
642 byte other than 0x80 - 0xbf. */
643 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
644 end = raw + (offset - pstr->mb_cur_max);
645 p = raw + offset - 1;
647 /* We know the wchar_t encoding is UCS4, so for the simple
648 case, ASCII characters, skip the conversion step. */
649 if (isascii (*p) && BE (pstr->trans == NULL, 1))
651 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
657 for (; p >= end; --p)
658 if ((*p & 0xc0) != 0x80)
662 Idx mlen = raw + pstr->len - p;
663 unsigned char buf[6];
667 if (BE (pstr->trans != NULL, 0))
669 int i = mlen < 6 ? mlen : 6;
671 buf[i] = pstr->trans[p[i]];
674 /* XXX Don't use mbrtowc, we know which conversion
675 to use (UTF-8 -> UCS4). */
676 memset (&cur_state, 0, sizeof (cur_state));
677 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
679 if (raw + offset - p <= mbclen
680 && mbclen < (size_t) -2)
682 memset (&pstr->cur_state, '\0',
684 pstr->valid_len = mbclen - (raw + offset - p);
692 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
693 if (BE (pstr->valid_len, 0))
695 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
696 pstr->wcs[wcs_idx] = WEOF;
697 if (pstr->mbs_allocated)
698 memset (pstr->mbs, -1, pstr->valid_len);
700 pstr->valid_raw_len = pstr->valid_len;
701 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
702 && IS_WIDE_WORD_CHAR (wc))
704 : ((IS_WIDE_NEWLINE (wc)
705 && pstr->newline_anchor)
706 ? CONTEXT_NEWLINE : 0));
709 #endif /* RE_ENABLE_I18N */
711 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
714 pstr->tip_context = (bitset_contain (pstr->word_char, c)
716 : ((IS_NEWLINE (c) && pstr->newline_anchor)
717 ? CONTEXT_NEWLINE : 0));
720 if (!BE (pstr->mbs_allocated, 0))
723 pstr->raw_mbs_idx = idx;
725 pstr->stop -= offset;
727 /* Then build the buffers. */
728 #ifdef RE_ENABLE_I18N
729 if (pstr->mb_cur_max > 1)
733 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
734 if (BE (ret != REG_NOERROR, 0))
738 build_wcs_buffer (pstr);
741 #endif /* RE_ENABLE_I18N */
742 if (BE (pstr->mbs_allocated, 0))
745 build_upper_buffer (pstr);
746 else if (pstr->trans != NULL)
747 re_string_translate_buffer (pstr);
750 pstr->valid_len = pstr->len;
757 internal_function __attribute ((pure))
758 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
763 /* Handle the common (easiest) cases first. */
764 if (BE (!pstr->mbs_allocated, 1))
765 return re_string_peek_byte (pstr, idx);
767 #ifdef RE_ENABLE_I18N
768 if (pstr->mb_cur_max > 1
769 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
770 return re_string_peek_byte (pstr, idx);
773 off = pstr->cur_idx + idx;
774 #ifdef RE_ENABLE_I18N
775 if (pstr->offsets_needed)
776 off = pstr->offsets[off];
779 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
781 #ifdef RE_ENABLE_I18N
782 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
783 this function returns CAPITAL LETTER I instead of first byte of
784 DOTLESS SMALL LETTER I. The latter would confuse the parser,
785 since peek_byte_case doesn't advance cur_idx in any way. */
786 if (pstr->offsets_needed && !isascii (ch))
787 return re_string_peek_byte (pstr, idx);
794 internal_function __attribute ((pure))
795 re_string_fetch_byte_case (re_string_t *pstr)
797 if (BE (!pstr->mbs_allocated, 1))
798 return re_string_fetch_byte (pstr);
800 #ifdef RE_ENABLE_I18N
801 if (pstr->offsets_needed)
806 /* For tr_TR.UTF-8 [[:islower:]] there is
807 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
808 in that case the whole multi-byte character and return
809 the original letter. On the other side, with
810 [[: DOTLESS SMALL LETTER I return [[:I, as doing
811 anything else would complicate things too much. */
813 if (!re_string_first_byte (pstr, pstr->cur_idx))
814 return re_string_fetch_byte (pstr);
816 off = pstr->offsets[pstr->cur_idx];
817 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
820 return re_string_fetch_byte (pstr);
822 re_string_skip_bytes (pstr,
823 re_string_char_size_at (pstr, pstr->cur_idx));
828 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
833 re_string_destruct (re_string_t *pstr)
835 #ifdef RE_ENABLE_I18N
837 re_free (pstr->offsets);
838 #endif /* RE_ENABLE_I18N */
839 if (pstr->mbs_allocated)
843 /* Return the context at IDX in INPUT. */
847 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
850 if (BE (! REG_VALID_INDEX (idx), 0))
851 /* In this case, we use the value stored in input->tip_context,
852 since we can't know the character in input->mbs[-1] here. */
853 return input->tip_context;
854 if (BE (idx == input->len, 0))
855 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
856 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
857 #ifdef RE_ENABLE_I18N
858 if (input->mb_cur_max > 1)
862 while(input->wcs[wc_idx] == WEOF)
865 /* It must not happen. */
866 assert (REG_VALID_INDEX (wc_idx));
869 if (! REG_VALID_INDEX (wc_idx))
870 return input->tip_context;
872 wc = input->wcs[wc_idx];
873 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
875 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
876 ? CONTEXT_NEWLINE : 0);
881 c = re_string_byte_at (input, idx);
882 if (bitset_contain (input->word_char, c))
884 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
888 /* Functions for set operation. */
892 re_node_set_alloc (re_node_set *set, Idx size)
896 set->elems = re_malloc (Idx, size);
897 if (BE (set->elems == NULL, 0))
904 re_node_set_init_1 (re_node_set *set, Idx elem)
908 set->elems = re_malloc (Idx, 1);
909 if (BE (set->elems == NULL, 0))
911 set->alloc = set->nelem = 0;
914 set->elems[0] = elem;
920 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
923 set->elems = re_malloc (Idx, 2);
924 if (BE (set->elems == NULL, 0))
929 set->elems[0] = elem1;
936 set->elems[0] = elem1;
937 set->elems[1] = elem2;
941 set->elems[0] = elem2;
942 set->elems[1] = elem1;
950 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
952 dest->nelem = src->nelem;
955 dest->alloc = dest->nelem;
956 dest->elems = re_malloc (Idx, dest->alloc);
957 if (BE (dest->elems == NULL, 0))
959 dest->alloc = dest->nelem = 0;
962 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
965 re_node_set_init_empty (dest);
969 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
970 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
971 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
975 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
976 const re_node_set *src2)
978 Idx i1, i2, is, id, delta, sbase;
979 if (src1->nelem == 0 || src2->nelem == 0)
982 /* We need dest->nelem + 2 * elems_in_intersection; this is a
983 conservative estimate. */
984 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
986 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
987 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
988 if (BE (new_elems == NULL, 0))
990 dest->elems = new_elems;
991 dest->alloc = new_alloc;
994 /* Find the items in the intersection of SRC1 and SRC2, and copy
995 into the top of DEST those that are not already in DEST itself. */
996 sbase = dest->nelem + src1->nelem + src2->nelem;
997 i1 = src1->nelem - 1;
998 i2 = src2->nelem - 1;
999 id = dest->nelem - 1;
1002 if (src1->elems[i1] == src2->elems[i2])
1004 /* Try to find the item in DEST. Maybe we could binary search? */
1005 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1008 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1009 dest->elems[--sbase] = src1->elems[i1];
1011 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1015 /* Lower the highest of the two items. */
1016 else if (src1->elems[i1] < src2->elems[i2])
1018 if (! REG_VALID_INDEX (--i2))
1023 if (! REG_VALID_INDEX (--i1))
1028 id = dest->nelem - 1;
1029 is = dest->nelem + src1->nelem + src2->nelem - 1;
1030 delta = is - sbase + 1;
1032 /* Now copy. When DELTA becomes zero, the remaining
1033 DEST elements are already in place; this is more or
1034 less the same loop that is in re_node_set_merge. */
1035 dest->nelem += delta;
1036 if (delta > 0 && REG_VALID_INDEX (id))
1039 if (dest->elems[is] > dest->elems[id])
1041 /* Copy from the top. */
1042 dest->elems[id + delta--] = dest->elems[is--];
1048 /* Slide from the bottom. */
1049 dest->elems[id + delta] = dest->elems[id];
1050 if (! REG_VALID_INDEX (--id))
1055 /* Copy remaining SRC elements. */
1056 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1061 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1062 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1064 static reg_errcode_t
1066 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1067 const re_node_set *src2)
1070 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1072 dest->alloc = src1->nelem + src2->nelem;
1073 dest->elems = re_malloc (Idx, dest->alloc);
1074 if (BE (dest->elems == NULL, 0))
1079 if (src1 != NULL && src1->nelem > 0)
1080 return re_node_set_init_copy (dest, src1);
1081 else if (src2 != NULL && src2->nelem > 0)
1082 return re_node_set_init_copy (dest, src2);
1084 re_node_set_init_empty (dest);
1087 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1089 if (src1->elems[i1] > src2->elems[i2])
1091 dest->elems[id++] = src2->elems[i2++];
1094 if (src1->elems[i1] == src2->elems[i2])
1096 dest->elems[id++] = src1->elems[i1++];
1098 if (i1 < src1->nelem)
1100 memcpy (dest->elems + id, src1->elems + i1,
1101 (src1->nelem - i1) * sizeof (Idx));
1102 id += src1->nelem - i1;
1104 else if (i2 < src2->nelem)
1106 memcpy (dest->elems + id, src2->elems + i2,
1107 (src2->nelem - i2) * sizeof (Idx));
1108 id += src2->nelem - i2;
1114 /* Calculate the union set of the sets DEST and SRC. And store it to
1115 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1117 static reg_errcode_t
1119 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1121 Idx is, id, sbase, delta;
1122 if (src == NULL || src->nelem == 0)
1124 if (dest->alloc < 2 * src->nelem + dest->nelem)
1126 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1127 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1128 if (BE (new_buffer == NULL, 0))
1130 dest->elems = new_buffer;
1131 dest->alloc = new_alloc;
1134 if (BE (dest->nelem == 0, 0))
1136 dest->nelem = src->nelem;
1137 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1141 /* Copy into the top of DEST the items of SRC that are not
1142 found in DEST. Maybe we could binary search in DEST? */
1143 for (sbase = dest->nelem + 2 * src->nelem,
1144 is = src->nelem - 1, id = dest->nelem - 1;
1145 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1147 if (dest->elems[id] == src->elems[is])
1149 else if (dest->elems[id] < src->elems[is])
1150 dest->elems[--sbase] = src->elems[is--];
1151 else /* if (dest->elems[id] > src->elems[is]) */
1155 if (REG_VALID_INDEX (is))
1157 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1159 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1162 id = dest->nelem - 1;
1163 is = dest->nelem + 2 * src->nelem - 1;
1164 delta = is - sbase + 1;
1168 /* Now copy. When DELTA becomes zero, the remaining
1169 DEST elements are already in place. */
1170 dest->nelem += delta;
1173 if (dest->elems[is] > dest->elems[id])
1175 /* Copy from the top. */
1176 dest->elems[id + delta--] = dest->elems[is--];
1182 /* Slide from the bottom. */
1183 dest->elems[id + delta] = dest->elems[id];
1184 if (! REG_VALID_INDEX (--id))
1186 /* Copy remaining SRC elements. */
1187 memcpy (dest->elems, dest->elems + sbase,
1188 delta * sizeof (Idx));
1197 /* Insert the new element ELEM to the re_node_set* SET.
1198 SET should not already have ELEM.
1199 Return true if successful. */
1203 re_node_set_insert (re_node_set *set, Idx elem)
1206 /* In case the set is empty. */
1207 if (set->alloc == 0)
1208 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1210 if (BE (set->nelem, 0) == 0)
1212 /* We already guaranteed above that set->alloc != 0. */
1213 set->elems[0] = elem;
1218 /* Realloc if we need. */
1219 if (set->alloc == set->nelem)
1222 set->alloc = set->alloc * 2;
1223 new_elems = re_realloc (set->elems, Idx, set->alloc);
1224 if (BE (new_elems == NULL, 0))
1226 set->elems = new_elems;
1229 /* Move the elements which follows the new element. Test the
1230 first element separately to skip a check in the inner loop. */
1231 if (elem < set->elems[0])
1234 for (idx = set->nelem; idx > 0; idx--)
1235 set->elems[idx] = set->elems[idx - 1];
1239 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1240 set->elems[idx] = set->elems[idx - 1];
1243 /* Insert the new element. */
1244 set->elems[idx] = elem;
1249 /* Insert the new element ELEM to the re_node_set* SET.
1250 SET should not already have any element greater than or equal to ELEM.
1251 Return true if successful. */
1255 re_node_set_insert_last (re_node_set *set, Idx elem)
1257 /* Realloc if we need. */
1258 if (set->alloc == set->nelem)
1261 set->alloc = (set->alloc + 1) * 2;
1262 new_elems = re_realloc (set->elems, Idx, set->alloc);
1263 if (BE (new_elems == NULL, 0))
1265 set->elems = new_elems;
1268 /* Insert the new element. */
1269 set->elems[set->nelem++] = elem;
1273 /* Compare two node sets SET1 and SET2.
1274 Return true if SET1 and SET2 are equivalent. */
1277 internal_function __attribute ((pure))
1278 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1281 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1283 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1284 if (set1->elems[i] != set2->elems[i])
1289 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1292 internal_function __attribute ((pure))
1293 re_node_set_contains (const re_node_set *set, Idx elem)
1295 __re_size_t idx, right, mid;
1296 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1299 /* Binary search the element. */
1301 right = set->nelem - 1;
1304 mid = (idx + right) / 2;
1305 if (set->elems[mid] < elem)
1310 return set->elems[idx] == elem ? idx + 1 : 0;
1315 re_node_set_remove_at (re_node_set *set, Idx idx)
1317 if (idx < 0 || idx >= set->nelem)
1320 for (; idx < set->nelem; idx++)
1321 set->elems[idx] = set->elems[idx + 1];
1325 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1326 Or return REG_MISSING if an error occurred. */
1330 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1332 int type = token.type;
1333 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1335 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1336 Idx *new_nexts, *new_indices;
1337 re_node_set *new_edests, *new_eclosures;
1338 re_token_t *new_nodes;
1339 size_t max_object_size =
1340 MAX (sizeof (re_token_t),
1341 MAX (sizeof (re_node_set),
1344 /* Avoid overflows. */
1345 if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1348 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1349 if (BE (new_nodes == NULL, 0))
1351 dfa->nodes = new_nodes;
1352 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1353 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1354 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1355 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1356 if (BE (new_nexts == NULL || new_indices == NULL
1357 || new_edests == NULL || new_eclosures == NULL, 0))
1359 dfa->nexts = new_nexts;
1360 dfa->org_indices = new_indices;
1361 dfa->edests = new_edests;
1362 dfa->eclosures = new_eclosures;
1363 dfa->nodes_alloc = new_nodes_alloc;
1365 dfa->nodes[dfa->nodes_len] = token;
1366 dfa->nodes[dfa->nodes_len].constraint = 0;
1367 #ifdef RE_ENABLE_I18N
1368 dfa->nodes[dfa->nodes_len].accept_mb =
1369 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1371 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1372 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1373 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1374 return dfa->nodes_len++;
1377 static inline re_hashval_t
1379 calc_state_hash (const re_node_set *nodes, unsigned int context)
1381 re_hashval_t hash = nodes->nelem + context;
1383 for (i = 0 ; i < nodes->nelem ; i++)
1384 hash += nodes->elems[i];
1388 /* Search for the state whose node_set is equivalent to NODES.
1389 Return the pointer to the state, if we found it in the DFA.
1390 Otherwise create the new one and return it. In case of an error
1391 return NULL and set the error code in ERR.
1392 Note: - We assume NULL as the invalid state, then it is possible that
1393 return value is NULL and ERR is REG_NOERROR.
1394 - We never return non-NULL value in case of any errors, it is for
1397 static re_dfastate_t *
1399 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1400 const re_node_set *nodes)
1403 re_dfastate_t *new_state;
1404 struct re_state_table_entry *spot;
1407 /* Suppress bogus uninitialized-variable warnings. */
1410 if (BE (nodes->nelem == 0, 0))
1415 hash = calc_state_hash (nodes, 0);
1416 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1418 for (i = 0 ; i < spot->num ; i++)
1420 re_dfastate_t *state = spot->array[i];
1421 if (hash != state->hash)
1423 if (re_node_set_compare (&state->nodes, nodes))
1427 /* There are no appropriate state in the dfa, create the new one. */
1428 new_state = create_ci_newstate (dfa, nodes, hash);
1429 if (BE (new_state == NULL, 0))
1435 /* Search for the state whose node_set is equivalent to NODES and
1436 whose context is equivalent to CONTEXT.
1437 Return the pointer to the state, if we found it in the DFA.
1438 Otherwise create the new one and return it. In case of an error
1439 return NULL and set the error code in ERR.
1440 Note: - We assume NULL as the invalid state, then it is possible that
1441 return value is NULL and ERR is REG_NOERROR.
1442 - We never return non-NULL value in case of any errors, it is for
1445 static re_dfastate_t *
1447 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1448 const re_node_set *nodes, unsigned int context)
1451 re_dfastate_t *new_state;
1452 struct re_state_table_entry *spot;
1455 /* Suppress bogus uninitialized-variable warnings. */
1458 if (nodes->nelem == 0)
1463 hash = calc_state_hash (nodes, context);
1464 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1466 for (i = 0 ; i < spot->num ; i++)
1468 re_dfastate_t *state = spot->array[i];
1469 if (state->hash == hash
1470 && state->context == context
1471 && re_node_set_compare (state->entrance_nodes, nodes))
1474 /* There are no appropriate state in `dfa', create the new one. */
1475 new_state = create_cd_newstate (dfa, nodes, context, hash);
1476 if (BE (new_state == NULL, 0))
1482 /* Finish initialization of the new state NEWSTATE, and using its hash value
1483 HASH put in the appropriate bucket of DFA's state table. Return value
1484 indicates the error code if failed. */
1486 static reg_errcode_t
1487 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1490 struct re_state_table_entry *spot;
1494 newstate->hash = hash;
1495 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1496 if (BE (err != REG_NOERROR, 0))
1498 for (i = 0; i < newstate->nodes.nelem; i++)
1500 Idx elem = newstate->nodes.elems[i];
1501 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1502 if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1506 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1507 if (BE (spot->alloc <= spot->num, 0))
1509 Idx new_alloc = 2 * spot->num + 2;
1510 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1512 if (BE (new_array == NULL, 0))
1514 spot->array = new_array;
1515 spot->alloc = new_alloc;
1517 spot->array[spot->num++] = newstate;
1522 free_state (re_dfastate_t *state)
1524 re_node_set_free (&state->non_eps_nodes);
1525 re_node_set_free (&state->inveclosure);
1526 if (state->entrance_nodes != &state->nodes)
1528 re_node_set_free (state->entrance_nodes);
1529 re_free (state->entrance_nodes);
1531 re_node_set_free (&state->nodes);
1532 re_free (state->word_trtable);
1533 re_free (state->trtable);
1537 /* Create the new state which is independ of contexts.
1538 Return the new state if succeeded, otherwise return NULL. */
1540 static re_dfastate_t *
1542 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1547 re_dfastate_t *newstate;
1549 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1550 if (BE (newstate == NULL, 0))
1552 err = re_node_set_init_copy (&newstate->nodes, nodes);
1553 if (BE (err != REG_NOERROR, 0))
1559 newstate->entrance_nodes = &newstate->nodes;
1560 for (i = 0 ; i < nodes->nelem ; i++)
1562 re_token_t *node = dfa->nodes + nodes->elems[i];
1563 re_token_type_t type = node->type;
1564 if (type == CHARACTER && !node->constraint)
1566 #ifdef RE_ENABLE_I18N
1567 newstate->accept_mb |= node->accept_mb;
1568 #endif /* RE_ENABLE_I18N */
1570 /* If the state has the halt node, the state is a halt state. */
1571 if (type == END_OF_RE)
1573 else if (type == OP_BACK_REF)
1574 newstate->has_backref = 1;
1575 else if (type == ANCHOR || node->constraint)
1576 newstate->has_constraint = 1;
1578 err = register_state (dfa, newstate, hash);
1579 if (BE (err != REG_NOERROR, 0))
1581 free_state (newstate);
1587 /* Create the new state which is depend on the context CONTEXT.
1588 Return the new state if succeeded, otherwise return NULL. */
1590 static re_dfastate_t *
1592 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1593 unsigned int context, re_hashval_t hash)
1595 Idx i, nctx_nodes = 0;
1597 re_dfastate_t *newstate;
1599 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1600 if (BE (newstate == NULL, 0))
1602 err = re_node_set_init_copy (&newstate->nodes, nodes);
1603 if (BE (err != REG_NOERROR, 0))
1609 newstate->context = context;
1610 newstate->entrance_nodes = &newstate->nodes;
1612 for (i = 0 ; i < nodes->nelem ; i++)
1614 unsigned int constraint = 0;
1615 re_token_t *node = dfa->nodes + nodes->elems[i];
1616 re_token_type_t type = node->type;
1617 if (node->constraint)
1618 constraint = node->constraint;
1620 if (type == CHARACTER && !constraint)
1622 #ifdef RE_ENABLE_I18N
1623 newstate->accept_mb |= node->accept_mb;
1624 #endif /* RE_ENABLE_I18N */
1626 /* If the state has the halt node, the state is a halt state. */
1627 if (type == END_OF_RE)
1629 else if (type == OP_BACK_REF)
1630 newstate->has_backref = 1;
1631 else if (type == ANCHOR)
1632 constraint = node->opr.ctx_type;
1636 if (newstate->entrance_nodes == &newstate->nodes)
1638 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1639 if (BE (newstate->entrance_nodes == NULL, 0))
1641 free_state (newstate);
1644 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1646 newstate->has_constraint = 1;
1649 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1651 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1656 err = register_state (dfa, newstate, hash);
1657 if (BE (err != REG_NOERROR, 0))
1659 free_state (newstate);