1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static void re_string_construct_common (const char *str, Idx len,
22 REG_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
30 re_hashval_t hash) internal_function;
32 /* Functions for string operation. */
34 /* This function allocate the buffers. It is necessary to call
35 re_string_reconstruct before using the object. */
39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
40 REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
45 /* Ensure at least one character fits into the buffers. */
46 if (init_len < dfa->mb_cur_max)
47 init_len = dfa->mb_cur_max;
48 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49 re_string_construct_common (str, len, pstr, trans, icase, dfa);
51 ret = re_string_realloc_buffers (pstr, init_buf_len);
52 if (BE (ret != REG_NOERROR, 0))
55 pstr->word_char = dfa->word_char;
56 pstr->word_ops_used = dfa->word_ops_used;
57 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59 pstr->valid_raw_len = pstr->valid_len;
63 /* This function allocate the buffers, and initialize them. */
67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
68 REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
71 memset (pstr, '\0', sizeof (re_string_t));
72 re_string_construct_common (str, len, pstr, trans, icase, dfa);
76 ret = re_string_realloc_buffers (pstr, len + 1);
77 if (BE (ret != REG_NOERROR, 0))
80 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
85 if (dfa->mb_cur_max > 1)
89 ret = build_wcs_upper_buffer (pstr);
90 if (BE (ret != REG_NOERROR, 0))
92 if (pstr->valid_raw_len >= len)
94 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 if (BE (ret != REG_NOERROR, 0))
102 #endif /* RE_ENABLE_I18N */
103 build_upper_buffer (pstr);
107 #ifdef RE_ENABLE_I18N
108 if (dfa->mb_cur_max > 1)
109 build_wcs_buffer (pstr);
111 #endif /* RE_ENABLE_I18N */
114 re_string_translate_buffer (pstr);
117 pstr->valid_len = pstr->bufs_len;
118 pstr->valid_raw_len = pstr->bufs_len;
126 /* Helper functions for re_string_allocate, and re_string_construct. */
130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
132 #ifdef RE_ENABLE_I18N
133 if (pstr->mb_cur_max > 1)
135 wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
136 if (BE (new_wcs == NULL, 0))
139 if (pstr->offsets != NULL)
141 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
142 if (BE (new_offsets == NULL, 0))
144 pstr->offsets = new_offsets;
147 #endif /* RE_ENABLE_I18N */
148 if (pstr->mbs_allocated)
150 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
152 if (BE (new_mbs == NULL, 0))
156 pstr->bufs_len = new_buf_len;
163 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
164 REG_TRANSLATE_TYPE trans, bool icase,
167 pstr->raw_mbs = (const unsigned char *) str;
170 pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans;
172 pstr->mbs_allocated = (trans != NULL || icase);
173 pstr->mb_cur_max = dfa->mb_cur_max;
174 pstr->is_utf8 = dfa->is_utf8;
175 pstr->map_notascii = dfa->map_notascii;
176 pstr->stop = pstr->len;
177 pstr->raw_stop = pstr->stop;
180 #ifdef RE_ENABLE_I18N
182 /* Build wide character buffer PSTR->WCS.
183 If the byte sequence of the string are:
184 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
185 Then wide character buffer will be:
186 <wc1> , WEOF , <wc2> , WEOF , <wc3>
187 We use WEOF for padding, they indicate that the position isn't
188 a first byte of a multibyte character.
190 Note that this function assumes PSTR->VALID_LEN elements are already
191 built and starts from PSTR->VALID_LEN. */
195 build_wcs_buffer (re_string_t *pstr)
198 unsigned char buf[MB_LEN_MAX];
199 assert (MB_LEN_MAX >= pstr->mb_cur_max);
201 unsigned char buf[64];
204 Idx byte_idx, end_idx, remain_len;
207 /* Build the buffers from pstr->valid_len to either pstr->len or
209 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
210 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
215 remain_len = end_idx - byte_idx;
216 prev_st = pstr->cur_state;
217 /* Apply the translation if we need. */
218 if (BE (pstr->trans != NULL, 0))
222 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
224 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
225 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
227 p = (const char *) buf;
230 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
231 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
232 if (BE (mbclen == (size_t) -2, 0))
234 /* The buffer doesn't have enough space, finish to build. */
235 pstr->cur_state = prev_st;
238 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
240 /* We treat these cases as a singlebyte character. */
242 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
243 if (BE (pstr->trans != NULL, 0))
244 wc = pstr->trans[wc];
245 pstr->cur_state = prev_st;
248 /* Write wide character and padding. */
249 pstr->wcs[byte_idx++] = wc;
250 /* Write paddings. */
251 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
252 pstr->wcs[byte_idx++] = WEOF;
254 pstr->valid_len = byte_idx;
255 pstr->valid_raw_len = byte_idx;
258 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
259 but for REG_ICASE. */
263 build_wcs_upper_buffer (re_string_t *pstr)
266 Idx src_idx, byte_idx, end_idx, remain_len;
269 char buf[MB_LEN_MAX];
270 assert (MB_LEN_MAX >= pstr->mb_cur_max);
275 byte_idx = pstr->valid_len;
276 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
278 /* The following optimization assumes that ASCII characters can be
279 mapped to wide characters with a simple cast. */
280 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
282 while (byte_idx < end_idx)
286 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
287 && mbsinit (&pstr->cur_state))
289 /* In case of a singlebyte character. */
291 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
292 /* The next step uses the assumption that wchar_t is encoded
293 ASCII-safe: all ASCII values can be converted like this. */
294 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
299 remain_len = end_idx - byte_idx;
300 prev_st = pstr->cur_state;
301 mbclen = mbrtowc (&wc,
302 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
303 + byte_idx), remain_len, &pstr->cur_state);
304 if (BE (mbclen + 2 > 2, 1))
312 mbcdlen = wcrtomb (buf, wcu, &prev_st);
313 if (BE (mbclen == mbcdlen, 1))
314 memcpy (pstr->mbs + byte_idx, buf, mbclen);
322 memcpy (pstr->mbs + byte_idx,
323 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
324 pstr->wcs[byte_idx++] = wcu;
325 /* Write paddings. */
326 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
327 pstr->wcs[byte_idx++] = WEOF;
329 else if (mbclen == (size_t) -1 || mbclen == 0)
331 /* It is an invalid character or '\0'. Just use the byte. */
332 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
333 pstr->mbs[byte_idx] = ch;
334 /* And also cast it to wide char. */
335 pstr->wcs[byte_idx++] = (wchar_t) ch;
336 if (BE (mbclen == (size_t) -1, 0))
337 pstr->cur_state = prev_st;
341 /* The buffer doesn't have enough space, finish to build. */
342 pstr->cur_state = prev_st;
346 pstr->valid_len = byte_idx;
347 pstr->valid_raw_len = byte_idx;
351 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
356 remain_len = end_idx - byte_idx;
357 prev_st = pstr->cur_state;
358 if (BE (pstr->trans != NULL, 0))
362 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
364 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
365 buf[i] = pstr->trans[ch];
367 p = (const char *) buf;
370 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
371 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
372 if (BE (mbclen + 2 > 2, 1))
380 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
381 if (BE (mbclen == mbcdlen, 1))
382 memcpy (pstr->mbs + byte_idx, buf, mbclen);
383 else if (mbcdlen != (size_t) -1)
387 if (byte_idx + mbcdlen > pstr->bufs_len)
389 pstr->cur_state = prev_st;
393 if (pstr->offsets == NULL)
395 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
397 if (pstr->offsets == NULL)
400 if (!pstr->offsets_needed)
402 for (i = 0; i < (size_t) byte_idx; ++i)
403 pstr->offsets[i] = i;
404 pstr->offsets_needed = 1;
407 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
408 pstr->wcs[byte_idx] = wcu;
409 pstr->offsets[byte_idx] = src_idx;
410 for (i = 1; i < mbcdlen; ++i)
412 pstr->offsets[byte_idx + i]
413 = src_idx + (i < mbclen ? i : mbclen - 1);
414 pstr->wcs[byte_idx + i] = WEOF;
416 pstr->len += mbcdlen - mbclen;
417 if (pstr->raw_stop > src_idx)
418 pstr->stop += mbcdlen - mbclen;
419 end_idx = (pstr->bufs_len > pstr->len)
420 ? pstr->len : pstr->bufs_len;
426 memcpy (pstr->mbs + byte_idx, p, mbclen);
429 memcpy (pstr->mbs + byte_idx, p, mbclen);
431 if (BE (pstr->offsets_needed != 0, 0))
434 for (i = 0; i < mbclen; ++i)
435 pstr->offsets[byte_idx + i] = src_idx + i;
439 pstr->wcs[byte_idx++] = wcu;
440 /* Write paddings. */
441 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
442 pstr->wcs[byte_idx++] = WEOF;
444 else if (mbclen == (size_t) -1 || mbclen == 0)
446 /* It is an invalid character or '\0'. Just use the byte. */
447 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
449 if (BE (pstr->trans != NULL, 0))
450 ch = pstr->trans [ch];
451 pstr->mbs[byte_idx] = ch;
453 if (BE (pstr->offsets_needed != 0, 0))
454 pstr->offsets[byte_idx] = src_idx;
457 /* And also cast it to wide char. */
458 pstr->wcs[byte_idx++] = (wchar_t) ch;
459 if (BE (mbclen == (size_t) -1, 0))
460 pstr->cur_state = prev_st;
464 /* The buffer doesn't have enough space, finish to build. */
465 pstr->cur_state = prev_st;
469 pstr->valid_len = byte_idx;
470 pstr->valid_raw_len = src_idx;
474 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
479 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
486 /* Skip the characters which are not necessary to check. */
487 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
488 rawbuf_idx < new_raw_idx;)
491 remain_len = pstr->len - rawbuf_idx;
492 prev_st = pstr->cur_state;
493 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
494 remain_len, &pstr->cur_state);
495 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
497 /* We treat these cases as a singlebyte character. */
499 pstr->cur_state = prev_st;
501 /* Then proceed the next character. */
502 rawbuf_idx += mbclen;
504 *last_wc = (wint_t) wc;
507 #endif /* RE_ENABLE_I18N */
509 /* Build the buffer PSTR->MBS, and apply the translation if we need.
510 This function is used in case of REG_ICASE. */
514 build_upper_buffer (re_string_t *pstr)
516 Idx char_idx, end_idx;
517 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
519 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
521 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
522 if (BE (pstr->trans != NULL, 0))
523 ch = pstr->trans[ch];
525 pstr->mbs[char_idx] = toupper (ch);
527 pstr->mbs[char_idx] = ch;
529 pstr->valid_len = char_idx;
530 pstr->valid_raw_len = char_idx;
533 /* Apply TRANS to the buffer in PSTR. */
537 re_string_translate_buffer (re_string_t *pstr)
539 Idx buf_idx, end_idx;
540 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
542 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
544 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
545 pstr->mbs[buf_idx] = pstr->trans[ch];
548 pstr->valid_len = buf_idx;
549 pstr->valid_raw_len = buf_idx;
552 /* This function re-construct the buffers.
553 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
554 convert to upper case in case of REG_ICASE, apply translation. */
558 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
562 if (BE (pstr->raw_mbs_idx <= idx, 0))
563 offset = idx - pstr->raw_mbs_idx;
567 #ifdef RE_ENABLE_I18N
568 if (pstr->mb_cur_max > 1)
569 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
570 #endif /* RE_ENABLE_I18N */
571 pstr->len = pstr->raw_len;
572 pstr->stop = pstr->raw_stop;
574 pstr->raw_mbs_idx = 0;
575 pstr->valid_raw_len = 0;
576 pstr->offsets_needed = 0;
577 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
578 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
579 if (!pstr->mbs_allocated)
580 pstr->mbs = (unsigned char *) pstr->raw_mbs;
584 if (BE (offset != 0, 1))
586 /* Are the characters which are already checked remain? */
587 if (BE (offset < pstr->valid_raw_len, 1)
588 #ifdef RE_ENABLE_I18N
589 /* Handling this would enlarge the code too much.
590 Accept a slowdown in that case. */
591 && pstr->offsets_needed == 0
595 /* Yes, move them to the front of the buffer. */
596 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
597 #ifdef RE_ENABLE_I18N
598 if (pstr->mb_cur_max > 1)
599 memmove (pstr->wcs, pstr->wcs + offset,
600 (pstr->valid_len - offset) * sizeof (wint_t));
601 #endif /* RE_ENABLE_I18N */
602 if (BE (pstr->mbs_allocated, 0))
603 memmove (pstr->mbs, pstr->mbs + offset,
604 pstr->valid_len - offset);
605 pstr->valid_len -= offset;
606 pstr->valid_raw_len -= offset;
608 assert (pstr->valid_len > 0);
613 /* No, skip all characters until IDX. */
614 #ifdef RE_ENABLE_I18N
615 if (BE (pstr->offsets_needed, 0))
617 pstr->len = pstr->raw_len - idx + offset;
618 pstr->stop = pstr->raw_stop - idx + offset;
619 pstr->offsets_needed = 0;
623 pstr->valid_raw_len = 0;
624 #ifdef RE_ENABLE_I18N
625 if (pstr->mb_cur_max > 1)
632 const unsigned char *raw, *p, *q, *end;
634 /* Special case UTF-8. Multi-byte chars start with any
635 byte other than 0x80 - 0xbf. */
636 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
637 end = raw + (offset - pstr->mb_cur_max);
638 for (p = raw + offset - 1; p >= end; --p)
639 if ((*p & 0xc0) != 0x80)
643 Idx mlen = raw + pstr->len - p;
644 unsigned char buf[6];
647 if (BE (pstr->trans != NULL, 0))
649 int i = mlen < 6 ? mlen : 6;
651 buf[i] = pstr->trans[p[i]];
654 /* XXX Don't use mbrtowc, we know which conversion
655 to use (UTF-8 -> UCS4). */
656 memset (&cur_state, 0, sizeof (cur_state));
657 mlen = (mbrtowc (&wc2, (const char *) p, mlen,
659 - (raw + offset - p));
662 memset (&pstr->cur_state, '\0',
664 pstr->valid_len = mlen;
672 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
673 if (BE (pstr->valid_len, 0))
675 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
676 pstr->wcs[wcs_idx] = WEOF;
677 if (pstr->mbs_allocated)
678 memset (pstr->mbs, 255, pstr->valid_len);
680 pstr->valid_raw_len = pstr->valid_len;
681 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
682 && IS_WIDE_WORD_CHAR (wc))
684 : ((IS_WIDE_NEWLINE (wc)
685 && pstr->newline_anchor)
686 ? CONTEXT_NEWLINE : 0));
689 #endif /* RE_ENABLE_I18N */
691 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
694 pstr->tip_context = (bitset_contain (pstr->word_char, c)
696 : ((IS_NEWLINE (c) && pstr->newline_anchor)
697 ? CONTEXT_NEWLINE : 0));
700 if (!BE (pstr->mbs_allocated, 0))
703 pstr->raw_mbs_idx = idx;
705 pstr->stop -= offset;
707 /* Then build the buffers. */
708 #ifdef RE_ENABLE_I18N
709 if (pstr->mb_cur_max > 1)
713 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
714 if (BE (ret != REG_NOERROR, 0))
718 build_wcs_buffer (pstr);
721 #endif /* RE_ENABLE_I18N */
722 if (BE (pstr->mbs_allocated, 0))
725 build_upper_buffer (pstr);
726 else if (pstr->trans != NULL)
727 re_string_translate_buffer (pstr);
730 pstr->valid_len = pstr->len;
737 internal_function __attribute ((pure))
738 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
743 /* Handle the common (easiest) cases first. */
744 if (BE (!pstr->mbs_allocated, 1))
745 return re_string_peek_byte (pstr, idx);
747 #ifdef RE_ENABLE_I18N
748 if (pstr->mb_cur_max > 1
749 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
750 return re_string_peek_byte (pstr, idx);
753 off = pstr->cur_idx + idx;
754 #ifdef RE_ENABLE_I18N
755 if (pstr->offsets_needed)
756 off = pstr->offsets[off];
759 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
761 #ifdef RE_ENABLE_I18N
762 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
763 this function returns CAPITAL LETTER I instead of first byte of
764 DOTLESS SMALL LETTER I. The latter would confuse the parser,
765 since peek_byte_case doesn't advance cur_idx in any way. */
766 if (pstr->offsets_needed && !isascii (ch))
767 return re_string_peek_byte (pstr, idx);
774 internal_function __attribute ((pure))
775 re_string_fetch_byte_case (re_string_t *pstr)
777 if (BE (!pstr->mbs_allocated, 1))
778 return re_string_fetch_byte (pstr);
780 #ifdef RE_ENABLE_I18N
781 if (pstr->offsets_needed)
786 /* For tr_TR.UTF-8 [[:islower:]] there is
787 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
788 in that case the whole multi-byte character and return
789 the original letter. On the other side, with
790 [[: DOTLESS SMALL LETTER I return [[:I, as doing
791 anything else would complicate things too much. */
793 if (!re_string_first_byte (pstr, pstr->cur_idx))
794 return re_string_fetch_byte (pstr);
796 off = pstr->offsets[pstr->cur_idx];
797 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
800 return re_string_fetch_byte (pstr);
802 re_string_skip_bytes (pstr,
803 re_string_char_size_at (pstr, pstr->cur_idx));
808 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
813 re_string_destruct (re_string_t *pstr)
815 #ifdef RE_ENABLE_I18N
817 re_free (pstr->offsets);
818 #endif /* RE_ENABLE_I18N */
819 if (pstr->mbs_allocated)
823 /* Return the context at IDX in INPUT. */
827 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
830 if (BE (! REG_VALID_INDEX (idx), 0))
831 /* In this case, we use the value stored in input->tip_context,
832 since we can't know the character in input->mbs[-1] here. */
833 return input->tip_context;
834 if (BE (idx == input->len, 0))
835 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
836 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
837 #ifdef RE_ENABLE_I18N
838 if (input->mb_cur_max > 1)
842 while(input->wcs[wc_idx] == WEOF)
845 /* It must not happen. */
846 assert (wc_idx >= 0);
850 return input->tip_context;
852 wc = input->wcs[wc_idx];
853 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
855 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
856 ? CONTEXT_NEWLINE : 0);
861 c = re_string_byte_at (input, idx);
862 if (bitset_contain (input->word_char, c))
864 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
868 /* Functions for set operation. */
872 re_node_set_alloc (re_node_set *set, Idx size)
876 set->elems = re_malloc (Idx, size);
877 if (BE (set->elems == NULL, 0))
884 re_node_set_init_1 (re_node_set *set, Idx elem)
888 set->elems = re_malloc (Idx, 1);
889 if (BE (set->elems == NULL, 0))
891 set->alloc = set->nelem = 0;
894 set->elems[0] = elem;
900 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
903 set->elems = re_malloc (Idx, 2);
904 if (BE (set->elems == NULL, 0))
909 set->elems[0] = elem1;
916 set->elems[0] = elem1;
917 set->elems[1] = elem2;
921 set->elems[0] = elem2;
922 set->elems[1] = elem1;
930 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
932 dest->nelem = src->nelem;
935 dest->alloc = dest->nelem;
936 dest->elems = re_malloc (Idx, dest->alloc);
937 if (BE (dest->elems == NULL, 0))
939 dest->alloc = dest->nelem = 0;
942 memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
945 re_node_set_init_empty (dest);
949 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
950 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
951 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
955 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
956 const re_node_set *src2)
958 Idx i1, i2, is, id, delta, sbase;
959 if (src1->nelem == 0 || src2->nelem == 0)
962 /* We need dest->nelem + 2 * elems_in_intersection; this is a
963 conservative estimate. */
964 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
966 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
967 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
968 if (BE (new_elems == NULL, 0))
970 dest->elems = new_elems;
971 dest->alloc = new_alloc;
974 /* Find the items in the intersection of SRC1 and SRC2, and copy
975 into the top of DEST those that are not already in DEST itself. */
976 sbase = dest->nelem + src1->nelem + src2->nelem;
977 i1 = src1->nelem - 1;
978 i2 = src2->nelem - 1;
979 id = dest->nelem - 1;
982 if (src1->elems[i1] == src2->elems[i2])
984 /* Try to find the item in DEST. Maybe we could binary search? */
985 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
988 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
989 dest->elems[--sbase] = src1->elems[i1];
991 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
995 /* Lower the highest of the two items. */
996 else if (src1->elems[i1] < src2->elems[i2])
998 if (! REG_VALID_INDEX (--i2))
1003 if (! REG_VALID_INDEX (--i1))
1008 id = dest->nelem - 1;
1009 is = dest->nelem + src1->nelem + src2->nelem - 1;
1010 delta = is - sbase + 1;
1012 /* Now copy. When DELTA becomes zero, the remaining
1013 DEST elements are already in place; this is more or
1014 less the same loop that is in re_node_set_merge. */
1015 dest->nelem += delta;
1016 if (delta > 0 && REG_VALID_INDEX (id))
1019 if (dest->elems[is] > dest->elems[id])
1021 /* Copy from the top. */
1022 dest->elems[id + delta--] = dest->elems[is--];
1028 /* Slide from the bottom. */
1029 dest->elems[id + delta] = dest->elems[id];
1030 if (! REG_VALID_INDEX (--id))
1035 /* Copy remaining SRC elements. */
1036 memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]);
1041 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1042 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1044 static reg_errcode_t
1046 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1047 const re_node_set *src2)
1050 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1052 dest->alloc = src1->nelem + src2->nelem;
1053 dest->elems = re_malloc (Idx, dest->alloc);
1054 if (BE (dest->elems == NULL, 0))
1059 if (src1 != NULL && src1->nelem > 0)
1060 return re_node_set_init_copy (dest, src1);
1061 else if (src2 != NULL && src2->nelem > 0)
1062 return re_node_set_init_copy (dest, src2);
1064 re_node_set_init_empty (dest);
1067 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1069 if (src1->elems[i1] > src2->elems[i2])
1071 dest->elems[id++] = src2->elems[i2++];
1074 if (src1->elems[i1] == src2->elems[i2])
1076 dest->elems[id++] = src1->elems[i1++];
1078 if (i1 < src1->nelem)
1080 memcpy (dest->elems + id, src1->elems + i1,
1081 (src1->nelem - i1) * sizeof dest->elems[0]);
1082 id += src1->nelem - i1;
1084 else if (i2 < src2->nelem)
1086 memcpy (dest->elems + id, src2->elems + i2,
1087 (src2->nelem - i2) * sizeof dest->elems[0]);
1088 id += src2->nelem - i2;
1094 /* Calculate the union set of the sets DEST and SRC. And store it to
1095 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1097 static reg_errcode_t
1099 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1101 Idx is, id, sbase, delta;
1102 if (src == NULL || src->nelem == 0)
1104 if (dest->alloc < 2 * src->nelem + dest->nelem)
1106 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1107 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1108 if (BE (new_buffer == NULL, 0))
1110 dest->elems = new_buffer;
1111 dest->alloc = new_alloc;
1114 if (BE (dest->nelem == 0, 0))
1116 dest->nelem = src->nelem;
1117 memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
1121 /* Copy into the top of DEST the items of SRC that are not
1122 found in DEST. Maybe we could binary search in DEST? */
1123 for (sbase = dest->nelem + 2 * src->nelem,
1124 is = src->nelem - 1, id = dest->nelem - 1;
1125 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1127 if (dest->elems[id] == src->elems[is])
1129 else if (dest->elems[id] < src->elems[is])
1130 dest->elems[--sbase] = src->elems[is--];
1131 else /* if (dest->elems[id] > src->elems[is]) */
1135 if (REG_VALID_INDEX (is))
1137 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1139 memcpy (dest->elems + sbase, src->elems,
1140 (is + 1) * sizeof dest->elems[0]);
1143 id = dest->nelem - 1;
1144 is = dest->nelem + 2 * src->nelem - 1;
1145 delta = is - sbase + 1;
1149 /* Now copy. When DELTA becomes zero, the remaining
1150 DEST elements are already in place. */
1151 dest->nelem += delta;
1154 if (dest->elems[is] > dest->elems[id])
1156 /* Copy from the top. */
1157 dest->elems[id + delta--] = dest->elems[is--];
1163 /* Slide from the bottom. */
1164 dest->elems[id + delta] = dest->elems[id];
1165 if (! REG_VALID_INDEX (--id))
1167 /* Copy remaining SRC elements. */
1168 memcpy (dest->elems, dest->elems + sbase,
1169 delta * sizeof dest->elems[0]);
1178 /* Insert the new element ELEM to the re_node_set* SET.
1179 SET should not already have ELEM.
1180 Return true if successful. */
1184 re_node_set_insert (re_node_set *set, Idx elem)
1187 /* In case the set is empty. */
1188 if (set->alloc == 0)
1189 return re_node_set_init_1 (set, elem) == REG_NOERROR;
1191 if (BE (set->nelem, 0) == 0)
1193 /* We already guaranteed above that set->alloc != 0. */
1194 set->elems[0] = elem;
1199 /* Realloc if we need. */
1200 if (set->alloc == set->nelem)
1203 set->alloc = set->alloc * 2;
1204 new_elems = re_realloc (set->elems, Idx, set->alloc);
1205 if (BE (new_elems == NULL, 0))
1207 set->elems = new_elems;
1210 /* Move the elements which follows the new element. Test the
1211 first element separately to skip a check in the inner loop. */
1212 if (elem < set->elems[0])
1215 for (idx = set->nelem; idx > 0; idx--)
1216 set->elems[idx] = set->elems[idx - 1];
1220 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1221 set->elems[idx] = set->elems[idx - 1];
1224 /* Insert the new element. */
1225 set->elems[idx] = elem;
1230 /* Insert the new element ELEM to the re_node_set* SET.
1231 SET should not already have any element greater than or equal to ELEM.
1232 Return true if successful. */
1236 re_node_set_insert_last (re_node_set *set, Idx elem)
1238 /* Realloc if we need. */
1239 if (set->alloc == set->nelem)
1242 set->alloc = (set->alloc + 1) * 2;
1243 new_elems = re_realloc (set->elems, Idx, set->alloc);
1244 if (BE (new_elems == NULL, 0))
1246 set->elems = new_elems;
1249 /* Insert the new element. */
1250 set->elems[set->nelem++] = elem;
1254 /* Compare two node sets SET1 and SET2.
1255 Return true if SET1 and SET2 are equivalent. */
1258 internal_function __attribute ((pure))
1259 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1262 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1264 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1265 if (set1->elems[i] != set2->elems[i])
1270 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1273 internal_function __attribute ((pure))
1274 re_node_set_contains (const re_node_set *set, Idx elem)
1276 __re_size_t idx, right, mid;
1277 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1280 /* Binary search the element. */
1282 right = set->nelem - 1;
1285 mid = (idx + right) / 2;
1286 if (set->elems[mid] < elem)
1291 return set->elems[idx] == elem ? idx + 1 : 0;
1296 re_node_set_remove_at (re_node_set *set, Idx idx)
1298 if (idx < 0 || idx >= set->nelem)
1301 for (; idx < set->nelem; idx++)
1302 set->elems[idx] = set->elems[idx + 1];
1306 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1307 Or return REG_MISSING if an error occurred. */
1311 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1313 int type = token.type;
1314 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1316 Idx new_nodes_alloc = dfa->nodes_alloc * 2;
1317 Idx *new_nexts, *new_indices;
1318 re_node_set *new_edests, *new_eclosures;
1320 re_token_t *new_nodes = re_realloc (dfa->nodes, re_token_t,
1322 if (BE (new_nodes == NULL, 0))
1324 dfa->nodes = new_nodes;
1325 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1326 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1327 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1328 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1329 if (BE (new_nexts == NULL || new_indices == NULL
1330 || new_edests == NULL || new_eclosures == NULL, 0))
1332 dfa->nexts = new_nexts;
1333 dfa->org_indices = new_indices;
1334 dfa->edests = new_edests;
1335 dfa->eclosures = new_eclosures;
1336 dfa->nodes_alloc = new_nodes_alloc;
1338 dfa->nodes[dfa->nodes_len] = token;
1339 dfa->nodes[dfa->nodes_len].constraint = 0;
1340 #ifdef RE_ENABLE_I18N
1341 dfa->nodes[dfa->nodes_len].accept_mb =
1342 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1344 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1345 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1346 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1347 return dfa->nodes_len++;
1350 static inline re_hashval_t
1352 calc_state_hash (const re_node_set *nodes, unsigned int context)
1354 re_hashval_t hash = nodes->nelem + context;
1356 for (i = 0 ; i < nodes->nelem ; i++)
1357 hash += nodes->elems[i];
1361 /* Search for the state whose node_set is equivalent to NODES.
1362 Return the pointer to the state, if we found it in the DFA.
1363 Otherwise create the new one and return it. In case of an error
1364 return NULL and set the error code in ERR.
1365 Note: - We assume NULL as the invalid state, then it is possible that
1366 return value is NULL and ERR is REG_NOERROR.
1367 - We never return non-NULL value in case of any errors, it is for
1370 static re_dfastate_t*
1372 re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes)
1375 re_dfastate_t *new_state;
1376 struct re_state_table_entry *spot;
1379 /* Suppress bogus uninitialized-variable warnings. */
1382 if (BE (nodes->nelem == 0, 0))
1387 hash = calc_state_hash (nodes, 0);
1388 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1390 for (i = 0 ; i < spot->num ; i++)
1392 re_dfastate_t *state = spot->array[i];
1393 if (hash != state->hash)
1395 if (re_node_set_compare (&state->nodes, nodes))
1399 /* There are no appropriate state in the dfa, create the new one. */
1400 new_state = create_ci_newstate (dfa, nodes, hash);
1401 if (BE (new_state != NULL, 1))
1410 /* Search for the state whose node_set is equivalent to NODES and
1411 whose context is equivalent to CONTEXT.
1412 Return the pointer to the state, if we found it in the DFA.
1413 Otherwise create the new one and return it. In case of an error
1414 return NULL and set the error code in ERR.
1415 Note: - We assume NULL as the invalid state, then it is possible that
1416 return value is NULL and ERR is REG_NOERROR.
1417 - We never return non-NULL value in case of any errors, it is for
1420 static re_dfastate_t*
1422 re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa,
1423 const re_node_set *nodes, unsigned int context)
1426 re_dfastate_t *new_state;
1427 struct re_state_table_entry *spot;
1430 /* Suppress bogus uninitialized-variable warnings. */
1433 if (nodes->nelem == 0)
1438 hash = calc_state_hash (nodes, context);
1439 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1441 for (i = 0 ; i < spot->num ; i++)
1443 re_dfastate_t *state = spot->array[i];
1444 if (state->hash == hash
1445 && state->context == context
1446 && re_node_set_compare (state->entrance_nodes, nodes))
1449 /* There are no appropriate state in `dfa', create the new one. */
1450 new_state = create_cd_newstate (dfa, nodes, context, hash);
1451 if (BE (new_state != NULL, 1))
1460 /* Finish initialization of the new state NEWSTATE, and using its hash value
1461 HASH put in the appropriate bucket of DFA's state table. Return value
1462 indicates the error code if failed. */
1464 static reg_errcode_t
1466 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash)
1468 struct re_state_table_entry *spot;
1472 newstate->hash = hash;
1473 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1474 if (BE (err != REG_NOERROR, 0))
1476 for (i = 0; i < newstate->nodes.nelem; i++)
1478 Idx elem = newstate->nodes.elems[i];
1479 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1481 bool ok = re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1487 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1488 if (BE (spot->alloc <= spot->num, 0))
1490 Idx new_alloc = 2 * spot->num + 2;
1491 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1493 if (BE (new_array == NULL, 0))
1495 spot->array = new_array;
1496 spot->alloc = new_alloc;
1498 spot->array[spot->num++] = newstate;
1502 /* Create the new state which is independ of contexts.
1503 Return the new state if succeeded, otherwise return NULL. */
1505 static re_dfastate_t *
1507 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1512 re_dfastate_t *newstate;
1514 newstate = re_calloc (re_dfastate_t, 1);
1515 if (BE (newstate == NULL, 0))
1517 err = re_node_set_init_copy (&newstate->nodes, nodes);
1518 if (BE (err != REG_NOERROR, 0))
1524 newstate->entrance_nodes = &newstate->nodes;
1525 for (i = 0 ; i < nodes->nelem ; i++)
1527 re_token_t *node = dfa->nodes + nodes->elems[i];
1528 re_token_type_t type = node->type;
1529 if (type == CHARACTER && !node->constraint)
1531 #ifdef RE_ENABLE_I18N
1532 newstate->accept_mb |= node->accept_mb;
1533 #endif /* RE_ENABLE_I18N */
1535 /* If the state has the halt node, the state is a halt state. */
1536 if (type == END_OF_RE)
1538 else if (type == OP_BACK_REF)
1539 newstate->has_backref = 1;
1540 else if (type == ANCHOR || node->constraint)
1541 newstate->has_constraint = 1;
1543 err = register_state (dfa, newstate, hash);
1544 if (BE (err != REG_NOERROR, 0))
1546 free_state (newstate);
1552 /* Create the new state which is depend on the context CONTEXT.
1553 Return the new state if succeeded, otherwise return NULL. */
1555 static re_dfastate_t *
1557 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1558 unsigned int context, re_hashval_t hash)
1560 Idx i, nctx_nodes = 0;
1562 re_dfastate_t *newstate;
1564 newstate = re_calloc (re_dfastate_t, 1);
1565 if (BE (newstate == NULL, 0))
1567 err = re_node_set_init_copy (&newstate->nodes, nodes);
1568 if (BE (err != REG_NOERROR, 0))
1574 newstate->context = context;
1575 newstate->entrance_nodes = &newstate->nodes;
1577 for (i = 0 ; i < nodes->nelem ; i++)
1579 unsigned int constraint = 0;
1580 re_token_t *node = dfa->nodes + nodes->elems[i];
1581 re_token_type_t type = node->type;
1582 if (node->constraint)
1583 constraint = node->constraint;
1585 if (type == CHARACTER && !constraint)
1587 #ifdef RE_ENABLE_I18N
1588 newstate->accept_mb |= node->accept_mb;
1589 #endif /* RE_ENABLE_I18N */
1591 /* If the state has the halt node, the state is a halt state. */
1592 if (type == END_OF_RE)
1594 else if (type == OP_BACK_REF)
1595 newstate->has_backref = 1;
1596 else if (type == ANCHOR)
1597 constraint = node->opr.ctx_type;
1601 if (newstate->entrance_nodes == &newstate->nodes)
1603 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1604 if (BE (newstate->entrance_nodes == NULL, 0))
1606 free_state (newstate);
1609 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1611 newstate->has_constraint = 1;
1614 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1616 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1621 err = register_state (dfa, newstate, hash);
1622 if (BE (err != REG_NOERROR, 0))
1624 free_state (newstate);
1632 free_state (re_dfastate_t *state)
1634 re_node_set_free (&state->non_eps_nodes);
1635 re_node_set_free (&state->inveclosure);
1636 if (state->entrance_nodes != &state->nodes)
1638 re_node_set_free (state->entrance_nodes);
1639 re_free (state->entrance_nodes);
1641 re_node_set_free (&state->nodes);
1642 re_free (state->word_trtable);
1643 re_free (state->trtable);