1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static void re_string_construct_common (const char *str, Idx len,
22 REG_TRANSLATE_TYPE trans, int icase,
23 const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
30 re_hashval_t hash) internal_function;
32 /* Functions for string operation. */
34 /* This function allocate the buffers. It is necessary to call
35 re_string_reconstruct before using the object. */
39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
40 REG_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
45 /* Ensure at least one character fits into the buffers. */
46 if (init_len < dfa->mb_cur_max)
47 init_len = dfa->mb_cur_max;
48 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49 re_string_construct_common (str, len, pstr, trans, icase, dfa);
51 ret = re_string_realloc_buffers (pstr, init_buf_len);
52 if (BE (ret != REG_NOERROR, 0))
55 pstr->word_char = dfa->word_char;
56 pstr->word_ops_used = dfa->word_ops_used;
57 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59 pstr->valid_raw_len = pstr->valid_len;
63 /* This function allocate the buffers, and initialize them. */
67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
68 REG_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
71 memset (pstr, '\0', sizeof (re_string_t));
72 re_string_construct_common (str, len, pstr, trans, icase, dfa);
76 ret = re_string_realloc_buffers (pstr, len + 1);
77 if (BE (ret != REG_NOERROR, 0))
80 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
85 if (dfa->mb_cur_max > 1)
89 ret = build_wcs_upper_buffer (pstr);
90 if (BE (ret != REG_NOERROR, 0))
92 if (pstr->valid_raw_len >= len)
94 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 if (BE (ret != REG_NOERROR, 0))
102 #endif /* RE_ENABLE_I18N */
103 build_upper_buffer (pstr);
107 #ifdef RE_ENABLE_I18N
108 if (dfa->mb_cur_max > 1)
109 build_wcs_buffer (pstr);
111 #endif /* RE_ENABLE_I18N */
114 re_string_translate_buffer (pstr);
117 pstr->valid_len = pstr->bufs_len;
118 pstr->valid_raw_len = pstr->bufs_len;
126 /* Helper functions for re_string_allocate, and re_string_construct. */
130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
132 #ifdef RE_ENABLE_I18N
133 if (pstr->mb_cur_max > 1)
135 wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
136 if (BE (new_wcs == NULL, 0))
139 if (pstr->offsets != NULL)
141 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
142 if (BE (new_offsets == NULL, 0))
144 pstr->offsets = new_offsets;
147 #endif /* RE_ENABLE_I18N */
148 if (pstr->mbs_allocated)
150 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
152 if (BE (new_mbs == NULL, 0))
156 pstr->bufs_len = new_buf_len;
163 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
164 REG_TRANSLATE_TYPE trans, int icase,
167 pstr->raw_mbs = (const unsigned char *) str;
170 pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans;
171 pstr->icase = icase ? 1 : 0;
172 pstr->mbs_allocated = (trans != NULL || icase);
173 pstr->mb_cur_max = dfa->mb_cur_max;
174 pstr->is_utf8 = dfa->is_utf8;
175 pstr->map_notascii = dfa->map_notascii;
176 pstr->stop = pstr->len;
177 pstr->raw_stop = pstr->stop;
180 #ifdef RE_ENABLE_I18N
182 /* Build wide character buffer PSTR->WCS.
183 If the byte sequence of the string are:
184 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
185 Then wide character buffer will be:
186 <wc1> , WEOF , <wc2> , WEOF , <wc3>
187 We use WEOF for padding, they indicate that the position isn't
188 a first byte of a multibyte character.
190 Note that this function assumes PSTR->VALID_LEN elements are already
191 built and starts from PSTR->VALID_LEN. */
195 build_wcs_buffer (re_string_t *pstr)
198 unsigned char buf[MB_LEN_MAX];
199 assert (MB_LEN_MAX >= pstr->mb_cur_max);
201 unsigned char buf[64];
204 Idx byte_idx, end_idx, remain_len;
207 /* Build the buffers from pstr->valid_len to either pstr->len or
209 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
210 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
215 remain_len = end_idx - byte_idx;
216 prev_st = pstr->cur_state;
217 /* Apply the translation if we need. */
218 if (BE (pstr->trans != NULL, 0))
222 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
224 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
225 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
227 p = (const char *) buf;
230 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
231 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
232 if (BE (mbclen == (size_t) -2, 0))
234 /* The buffer doesn't have enough space, finish to build. */
235 pstr->cur_state = prev_st;
238 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
240 /* We treat these cases as a singlebyte character. */
242 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
243 if (BE (pstr->trans != NULL, 0))
244 wc = pstr->trans[wc];
245 pstr->cur_state = prev_st;
248 /* Write wide character and padding. */
249 pstr->wcs[byte_idx++] = wc;
250 /* Write paddings. */
251 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
252 pstr->wcs[byte_idx++] = WEOF;
254 pstr->valid_len = byte_idx;
255 pstr->valid_raw_len = byte_idx;
258 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
259 but for REG_ICASE. */
263 build_wcs_upper_buffer (re_string_t *pstr)
266 Idx src_idx, byte_idx, end_idx, remain_len;
269 char buf[MB_LEN_MAX];
270 assert (MB_LEN_MAX >= pstr->mb_cur_max);
275 byte_idx = pstr->valid_len;
276 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
278 /* The following optimization assumes that ASCII characters can be
279 mapped to wide characters with a simple cast. */
280 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
282 while (byte_idx < end_idx)
286 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
287 && mbsinit (&pstr->cur_state))
289 /* In case of a singlebyte character. */
291 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
292 /* The next step uses the assumption that wchar_t is encoded
293 ASCII-safe: all ASCII values can be converted like this. */
294 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
299 remain_len = end_idx - byte_idx;
300 prev_st = pstr->cur_state;
301 mbclen = mbrtowc (&wc,
302 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
303 + byte_idx), remain_len, &pstr->cur_state);
304 if (BE (mbclen + 2 > 2, 1))
312 mbcdlen = wcrtomb (buf, wcu, &prev_st);
313 if (BE (mbclen == mbcdlen, 1))
314 memcpy (pstr->mbs + byte_idx, buf, mbclen);
322 memcpy (pstr->mbs + byte_idx,
323 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
324 pstr->wcs[byte_idx++] = wcu;
325 /* Write paddings. */
326 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
327 pstr->wcs[byte_idx++] = WEOF;
329 else if (mbclen == (size_t) -1 || mbclen == 0)
331 /* It is an invalid character or '\0'. Just use the byte. */
332 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
333 pstr->mbs[byte_idx] = ch;
334 /* And also cast it to wide char. */
335 pstr->wcs[byte_idx++] = (wchar_t) ch;
336 if (BE (mbclen == (size_t) -1, 0))
337 pstr->cur_state = prev_st;
341 /* The buffer doesn't have enough space, finish to build. */
342 pstr->cur_state = prev_st;
346 pstr->valid_len = byte_idx;
347 pstr->valid_raw_len = byte_idx;
351 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
356 remain_len = end_idx - byte_idx;
357 prev_st = pstr->cur_state;
358 if (BE (pstr->trans != NULL, 0))
362 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
364 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
365 buf[i] = pstr->trans[ch];
367 p = (const char *) buf;
370 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
371 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
372 if (BE (mbclen + 2 > 2, 1))
380 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
381 if (BE (mbclen == mbcdlen, 1))
382 memcpy (pstr->mbs + byte_idx, buf, mbclen);
383 else if (mbcdlen != (size_t) -1)
387 if (byte_idx + mbcdlen > pstr->bufs_len)
389 pstr->cur_state = prev_st;
393 if (pstr->offsets == NULL)
395 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
397 if (pstr->offsets == NULL)
400 if (!pstr->offsets_needed)
402 for (i = 0; i < (size_t) byte_idx; ++i)
403 pstr->offsets[i] = i;
404 pstr->offsets_needed = 1;
407 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
408 pstr->wcs[byte_idx] = wcu;
409 pstr->offsets[byte_idx] = src_idx;
410 for (i = 1; i < mbcdlen; ++i)
412 pstr->offsets[byte_idx + i]
413 = src_idx + (i < mbclen ? i : mbclen - 1);
414 pstr->wcs[byte_idx + i] = WEOF;
416 pstr->len += mbcdlen - mbclen;
417 if (pstr->raw_stop > src_idx)
418 pstr->stop += mbcdlen - mbclen;
419 end_idx = (pstr->bufs_len > pstr->len)
420 ? pstr->len : pstr->bufs_len;
426 memcpy (pstr->mbs + byte_idx, p, mbclen);
429 memcpy (pstr->mbs + byte_idx, p, mbclen);
431 if (BE (pstr->offsets_needed != 0, 0))
434 for (i = 0; i < mbclen; ++i)
435 pstr->offsets[byte_idx + i] = src_idx + i;
439 pstr->wcs[byte_idx++] = wcu;
440 /* Write paddings. */
441 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
442 pstr->wcs[byte_idx++] = WEOF;
444 else if (mbclen == (size_t) -1 || mbclen == 0)
446 /* It is an invalid character or '\0'. Just use the byte. */
447 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
449 if (BE (pstr->trans != NULL, 0))
450 ch = pstr->trans [ch];
451 pstr->mbs[byte_idx] = ch;
453 if (BE (pstr->offsets_needed != 0, 0))
454 pstr->offsets[byte_idx] = src_idx;
457 /* And also cast it to wide char. */
458 pstr->wcs[byte_idx++] = (wchar_t) ch;
459 if (BE (mbclen == (size_t) -1, 0))
460 pstr->cur_state = prev_st;
464 /* The buffer doesn't have enough space, finish to build. */
465 pstr->cur_state = prev_st;
469 pstr->valid_len = byte_idx;
470 pstr->valid_raw_len = src_idx;
474 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
479 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
486 /* Skip the characters which are not necessary to check. */
487 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
488 rawbuf_idx < new_raw_idx;)
491 remain_len = pstr->len - rawbuf_idx;
492 prev_st = pstr->cur_state;
493 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
494 remain_len, &pstr->cur_state);
495 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
497 /* We treat these cases as a singlebyte character. */
499 pstr->cur_state = prev_st;
501 /* Then proceed the next character. */
502 rawbuf_idx += mbclen;
504 *last_wc = (wint_t) wc;
507 #endif /* RE_ENABLE_I18N */
509 /* Build the buffer PSTR->MBS, and apply the translation if we need.
510 This function is used in case of REG_ICASE. */
514 build_upper_buffer (re_string_t *pstr)
516 Idx char_idx, end_idx;
517 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
519 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
521 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
522 if (BE (pstr->trans != NULL, 0))
523 ch = pstr->trans[ch];
525 pstr->mbs[char_idx] = toupper (ch);
527 pstr->mbs[char_idx] = ch;
529 pstr->valid_len = char_idx;
530 pstr->valid_raw_len = char_idx;
533 /* Apply TRANS to the buffer in PSTR. */
537 re_string_translate_buffer (re_string_t *pstr)
539 Idx buf_idx, end_idx;
540 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
542 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
544 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
545 pstr->mbs[buf_idx] = pstr->trans[ch];
548 pstr->valid_len = buf_idx;
549 pstr->valid_raw_len = buf_idx;
552 /* This function re-construct the buffers.
553 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
554 convert to upper case in case of REG_ICASE, apply translation. */
558 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
560 regoff_t offset = (regoff_t) idx - (regoff_t) pstr->raw_mbs_idx;
561 if (BE (offset < 0, 0))
564 #ifdef RE_ENABLE_I18N
565 if (pstr->mb_cur_max > 1)
566 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
567 #endif /* RE_ENABLE_I18N */
568 pstr->len = pstr->raw_len;
569 pstr->stop = pstr->raw_stop;
571 pstr->raw_mbs_idx = 0;
572 pstr->valid_raw_len = 0;
573 pstr->offsets_needed = 0;
574 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
575 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
576 if (!pstr->mbs_allocated)
577 pstr->mbs = (unsigned char *) pstr->raw_mbs;
581 if (BE (offset != 0, 1))
583 /* Are the characters which are already checked remain? */
584 if (BE (offset < pstr->valid_raw_len, 1)
585 #ifdef RE_ENABLE_I18N
586 /* Handling this would enlarge the code too much.
587 Accept a slowdown in that case. */
588 && pstr->offsets_needed == 0
592 /* Yes, move them to the front of the buffer. */
593 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
594 #ifdef RE_ENABLE_I18N
595 if (pstr->mb_cur_max > 1)
596 memmove (pstr->wcs, pstr->wcs + offset,
597 (pstr->valid_len - offset) * sizeof (wint_t));
598 #endif /* RE_ENABLE_I18N */
599 if (BE (pstr->mbs_allocated, 0))
600 memmove (pstr->mbs, pstr->mbs + offset,
601 pstr->valid_len - offset);
602 pstr->valid_len -= offset;
603 pstr->valid_raw_len -= offset;
605 assert (pstr->valid_len > 0);
610 /* No, skip all characters until IDX. */
611 #ifdef RE_ENABLE_I18N
612 if (BE (pstr->offsets_needed, 0))
614 pstr->len = pstr->raw_len - idx + offset;
615 pstr->stop = pstr->raw_stop - idx + offset;
616 pstr->offsets_needed = 0;
620 pstr->valid_raw_len = 0;
621 #ifdef RE_ENABLE_I18N
622 if (pstr->mb_cur_max > 1)
629 const unsigned char *raw, *p, *q, *end;
631 /* Special case UTF-8. Multi-byte chars start with any
632 byte other than 0x80 - 0xbf. */
633 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
634 end = raw + (offset - pstr->mb_cur_max);
635 for (p = raw + offset - 1; p >= end; --p)
636 if ((*p & 0xc0) != 0x80)
640 Idx mlen = raw + pstr->len - p;
641 unsigned char buf[6];
644 if (BE (pstr->trans != NULL, 0))
646 int i = mlen < 6 ? mlen : 6;
648 buf[i] = pstr->trans[p[i]];
651 /* XXX Don't use mbrtowc, we know which conversion
652 to use (UTF-8 -> UCS4). */
653 memset (&cur_state, 0, sizeof (cur_state));
654 mlen = (mbrtowc (&wc2, (const char *) p, mlen,
656 - (raw + offset - p));
659 memset (&pstr->cur_state, '\0',
661 pstr->valid_len = mlen;
669 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
670 if (BE (pstr->valid_len, 0))
672 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
673 pstr->wcs[wcs_idx] = WEOF;
674 if (pstr->mbs_allocated)
675 memset (pstr->mbs, 255, pstr->valid_len);
677 pstr->valid_raw_len = pstr->valid_len;
678 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
679 && IS_WIDE_WORD_CHAR (wc))
681 : ((IS_WIDE_NEWLINE (wc)
682 && pstr->newline_anchor)
683 ? CONTEXT_NEWLINE : 0));
686 #endif /* RE_ENABLE_I18N */
688 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
691 pstr->tip_context = (bitset_contain (pstr->word_char, c)
693 : ((IS_NEWLINE (c) && pstr->newline_anchor)
694 ? CONTEXT_NEWLINE : 0));
697 if (!BE (pstr->mbs_allocated, 0))
700 pstr->raw_mbs_idx = idx;
702 pstr->stop -= offset;
704 /* Then build the buffers. */
705 #ifdef RE_ENABLE_I18N
706 if (pstr->mb_cur_max > 1)
710 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
711 if (BE (ret != REG_NOERROR, 0))
715 build_wcs_buffer (pstr);
718 #endif /* RE_ENABLE_I18N */
719 if (BE (pstr->mbs_allocated, 0))
722 build_upper_buffer (pstr);
723 else if (pstr->trans != NULL)
724 re_string_translate_buffer (pstr);
727 pstr->valid_len = pstr->len;
734 internal_function __attribute ((pure))
735 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
740 /* Handle the common (easiest) cases first. */
741 if (BE (!pstr->mbs_allocated, 1))
742 return re_string_peek_byte (pstr, idx);
744 #ifdef RE_ENABLE_I18N
745 if (pstr->mb_cur_max > 1
746 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
747 return re_string_peek_byte (pstr, idx);
750 off = pstr->cur_idx + idx;
751 #ifdef RE_ENABLE_I18N
752 if (pstr->offsets_needed)
753 off = pstr->offsets[off];
756 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
758 #ifdef RE_ENABLE_I18N
759 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
760 this function returns CAPITAL LETTER I instead of first byte of
761 DOTLESS SMALL LETTER I. The latter would confuse the parser,
762 since peek_byte_case doesn't advance cur_idx in any way. */
763 if (pstr->offsets_needed && !isascii (ch))
764 return re_string_peek_byte (pstr, idx);
771 internal_function __attribute ((pure))
772 re_string_fetch_byte_case (re_string_t *pstr)
774 if (BE (!pstr->mbs_allocated, 1))
775 return re_string_fetch_byte (pstr);
777 #ifdef RE_ENABLE_I18N
778 if (pstr->offsets_needed)
783 /* For tr_TR.UTF-8 [[:islower:]] there is
784 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
785 in that case the whole multi-byte character and return
786 the original letter. On the other side, with
787 [[: DOTLESS SMALL LETTER I return [[:I, as doing
788 anything else would complicate things too much. */
790 if (!re_string_first_byte (pstr, pstr->cur_idx))
791 return re_string_fetch_byte (pstr);
793 off = pstr->offsets[pstr->cur_idx];
794 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
797 return re_string_fetch_byte (pstr);
799 re_string_skip_bytes (pstr,
800 re_string_char_size_at (pstr, pstr->cur_idx));
805 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
810 re_string_destruct (re_string_t *pstr)
812 #ifdef RE_ENABLE_I18N
814 re_free (pstr->offsets);
815 #endif /* RE_ENABLE_I18N */
816 if (pstr->mbs_allocated)
820 /* Return the context at IDX in INPUT. */
824 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
827 if (BE (! REG_VALID_INDEX (idx), 0))
828 /* In this case, we use the value stored in input->tip_context,
829 since we can't know the character in input->mbs[-1] here. */
830 return input->tip_context;
831 if (BE (idx == input->len, 0))
832 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
833 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
834 #ifdef RE_ENABLE_I18N
835 if (input->mb_cur_max > 1)
839 while(input->wcs[wc_idx] == WEOF)
842 /* It must not happen. */
843 assert (wc_idx >= 0);
847 return input->tip_context;
849 wc = input->wcs[wc_idx];
850 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
852 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
853 ? CONTEXT_NEWLINE : 0);
858 c = re_string_byte_at (input, idx);
859 if (bitset_contain (input->word_char, c))
861 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
865 /* Functions for set operation. */
869 re_node_set_alloc (re_node_set *set, Idx size)
873 set->elems = re_malloc (Idx, size);
874 if (BE (set->elems == NULL, 0))
881 re_node_set_init_1 (re_node_set *set, Idx elem)
885 set->elems = re_malloc (Idx, 1);
886 if (BE (set->elems == NULL, 0))
888 set->alloc = set->nelem = 0;
891 set->elems[0] = elem;
897 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
900 set->elems = re_malloc (Idx, 2);
901 if (BE (set->elems == NULL, 0))
906 set->elems[0] = elem1;
913 set->elems[0] = elem1;
914 set->elems[1] = elem2;
918 set->elems[0] = elem2;
919 set->elems[1] = elem1;
927 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
929 dest->nelem = src->nelem;
932 dest->alloc = dest->nelem;
933 dest->elems = re_malloc (Idx, dest->alloc);
934 if (BE (dest->elems == NULL, 0))
936 dest->alloc = dest->nelem = 0;
939 memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
942 re_node_set_init_empty (dest);
946 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
947 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
948 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
952 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
953 const re_node_set *src2)
955 Idx i1, i2, is, id, delta, sbase;
956 if (src1->nelem == 0 || src2->nelem == 0)
959 /* We need dest->nelem + 2 * elems_in_intersection; this is a
960 conservative estimate. */
961 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
963 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
964 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
965 if (BE (new_elems == NULL, 0))
967 dest->elems = new_elems;
968 dest->alloc = new_alloc;
971 /* Find the items in the intersection of SRC1 and SRC2, and copy
972 into the top of DEST those that are not already in DEST itself. */
973 sbase = dest->nelem + src1->nelem + src2->nelem;
974 i1 = src1->nelem - 1;
975 i2 = src2->nelem - 1;
976 id = dest->nelem - 1;
979 if (src1->elems[i1] == src2->elems[i2])
981 /* Try to find the item in DEST. Maybe we could binary search? */
982 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
985 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
986 dest->elems[--sbase] = src1->elems[i1];
988 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
992 /* Lower the highest of the two items. */
993 else if (src1->elems[i1] < src2->elems[i2])
995 if (! REG_VALID_INDEX (--i2))
1000 if (! REG_VALID_INDEX (--i1))
1005 id = dest->nelem - 1;
1006 is = dest->nelem + src1->nelem + src2->nelem - 1;
1007 delta = is - sbase + 1;
1009 /* Now copy. When DELTA becomes zero, the remaining
1010 DEST elements are already in place; this is more or
1011 less the same loop that is in re_node_set_merge. */
1012 dest->nelem += delta;
1013 if (delta > 0 && REG_VALID_INDEX (id))
1016 if (dest->elems[is] > dest->elems[id])
1018 /* Copy from the top. */
1019 dest->elems[id + delta--] = dest->elems[is--];
1025 /* Slide from the bottom. */
1026 dest->elems[id + delta] = dest->elems[id];
1027 if (! REG_VALID_INDEX (--id))
1032 /* Copy remaining SRC elements. */
1033 memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]);
1038 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1039 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1041 static reg_errcode_t
1043 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1044 const re_node_set *src2)
1047 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1049 dest->alloc = src1->nelem + src2->nelem;
1050 dest->elems = re_malloc (Idx, dest->alloc);
1051 if (BE (dest->elems == NULL, 0))
1056 if (src1 != NULL && src1->nelem > 0)
1057 return re_node_set_init_copy (dest, src1);
1058 else if (src2 != NULL && src2->nelem > 0)
1059 return re_node_set_init_copy (dest, src2);
1061 re_node_set_init_empty (dest);
1064 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1066 if (src1->elems[i1] > src2->elems[i2])
1068 dest->elems[id++] = src2->elems[i2++];
1071 if (src1->elems[i1] == src2->elems[i2])
1073 dest->elems[id++] = src1->elems[i1++];
1075 if (i1 < src1->nelem)
1077 memcpy (dest->elems + id, src1->elems + i1,
1078 (src1->nelem - i1) * sizeof dest->elems[0]);
1079 id += src1->nelem - i1;
1081 else if (i2 < src2->nelem)
1083 memcpy (dest->elems + id, src2->elems + i2,
1084 (src2->nelem - i2) * sizeof dest->elems[0]);
1085 id += src2->nelem - i2;
1091 /* Calculate the union set of the sets DEST and SRC. And store it to
1092 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1094 static reg_errcode_t
1096 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1098 Idx is, id, sbase, delta;
1099 if (src == NULL || src->nelem == 0)
1101 if (dest->alloc < 2 * src->nelem + dest->nelem)
1103 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1104 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1105 if (BE (new_buffer == NULL, 0))
1107 dest->elems = new_buffer;
1108 dest->alloc = new_alloc;
1111 if (BE (dest->nelem == 0, 0))
1113 dest->nelem = src->nelem;
1114 memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
1118 /* Copy into the top of DEST the items of SRC that are not
1119 found in DEST. Maybe we could binary search in DEST? */
1120 for (sbase = dest->nelem + 2 * src->nelem,
1121 is = src->nelem - 1, id = dest->nelem - 1;
1122 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1124 if (dest->elems[id] == src->elems[is])
1126 else if (dest->elems[id] < src->elems[is])
1127 dest->elems[--sbase] = src->elems[is--];
1128 else /* if (dest->elems[id] > src->elems[is]) */
1132 if (REG_VALID_INDEX (is))
1134 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1136 memcpy (dest->elems + sbase, src->elems,
1137 (is + 1) * sizeof dest->elems[0]);
1140 id = dest->nelem - 1;
1141 is = dest->nelem + 2 * src->nelem - 1;
1142 delta = is - sbase + 1;
1146 /* Now copy. When DELTA becomes zero, the remaining
1147 DEST elements are already in place. */
1148 dest->nelem += delta;
1151 if (dest->elems[is] > dest->elems[id])
1153 /* Copy from the top. */
1154 dest->elems[id + delta--] = dest->elems[is--];
1160 /* Slide from the bottom. */
1161 dest->elems[id + delta] = dest->elems[id];
1162 if (! REG_VALID_INDEX (--id))
1164 /* Copy remaining SRC elements. */
1165 memcpy (dest->elems, dest->elems + sbase,
1166 delta * sizeof dest->elems[0]);
1175 /* Insert the new element ELEM to the re_node_set* SET.
1176 SET should not already have ELEM.
1177 return -1 if an error is occured, return 1 otherwise. */
1181 re_node_set_insert (re_node_set *set, Idx elem)
1184 /* In case the set is empty. */
1185 if (set->alloc == 0)
1187 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1193 if (BE (set->nelem, 0) == 0)
1195 /* We already guaranteed above that set->alloc != 0. */
1196 set->elems[0] = elem;
1201 /* Realloc if we need. */
1202 if (set->alloc == set->nelem)
1205 set->alloc = set->alloc * 2;
1206 new_elems = re_realloc (set->elems, Idx, set->alloc);
1207 if (BE (new_elems == NULL, 0))
1209 set->elems = new_elems;
1212 /* Move the elements which follows the new element. Test the
1213 first element separately to skip a check in the inner loop. */
1214 if (elem < set->elems[0])
1217 for (idx = set->nelem; idx > 0; idx--)
1218 set->elems[idx] = set->elems[idx - 1];
1222 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1223 set->elems[idx] = set->elems[idx - 1];
1226 /* Insert the new element. */
1227 set->elems[idx] = elem;
1232 /* Insert the new element ELEM to the re_node_set* SET.
1233 SET should not already have any element greater than or equal to ELEM.
1234 Return -1 if an error is occured, return 1 otherwise. */
1238 re_node_set_insert_last (re_node_set *set, Idx elem)
1240 /* Realloc if we need. */
1241 if (set->alloc == set->nelem)
1244 set->alloc = (set->alloc + 1) * 2;
1245 new_elems = re_realloc (set->elems, Idx, set->alloc);
1246 if (BE (new_elems == NULL, 0))
1248 set->elems = new_elems;
1251 /* Insert the new element. */
1252 set->elems[set->nelem++] = elem;
1256 /* Compare two node sets SET1 and SET2.
1257 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1260 internal_function __attribute ((pure))
1261 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1264 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1266 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1267 if (set1->elems[i] != set2->elems[i])
1272 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1275 internal_function __attribute ((pure))
1276 re_node_set_contains (const re_node_set *set, Idx elem)
1278 __re_size_t idx, right, mid;
1279 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1282 /* Binary search the element. */
1284 right = set->nelem - 1;
1287 mid = (idx + right) / 2;
1288 if (set->elems[mid] < elem)
1293 return set->elems[idx] == elem ? idx + 1 : 0;
1298 re_node_set_remove_at (re_node_set *set, Idx idx)
1300 if (idx < 0 || idx >= set->nelem)
1303 for (; idx < set->nelem; idx++)
1304 set->elems[idx] = set->elems[idx + 1];
1308 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1309 Or return REG_MISSING if an error occurred. */
1313 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1315 int type = token.type;
1316 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1318 Idx new_nodes_alloc = dfa->nodes_alloc * 2;
1319 Idx *new_nexts, *new_indices;
1320 re_node_set *new_edests, *new_eclosures;
1322 re_token_t *new_nodes = re_realloc (dfa->nodes, re_token_t,
1324 if (BE (new_nodes == NULL, 0))
1326 dfa->nodes = new_nodes;
1327 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1328 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1329 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1330 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1331 if (BE (new_nexts == NULL || new_indices == NULL
1332 || new_edests == NULL || new_eclosures == NULL, 0))
1334 dfa->nexts = new_nexts;
1335 dfa->org_indices = new_indices;
1336 dfa->edests = new_edests;
1337 dfa->eclosures = new_eclosures;
1338 dfa->nodes_alloc = new_nodes_alloc;
1340 dfa->nodes[dfa->nodes_len] = token;
1341 dfa->nodes[dfa->nodes_len].constraint = 0;
1342 #ifdef RE_ENABLE_I18N
1343 dfa->nodes[dfa->nodes_len].accept_mb =
1344 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1346 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1347 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1348 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1349 return dfa->nodes_len++;
1352 static inline re_hashval_t
1354 calc_state_hash (const re_node_set *nodes, unsigned int context)
1356 re_hashval_t hash = nodes->nelem + context;
1358 for (i = 0 ; i < nodes->nelem ; i++)
1359 hash += nodes->elems[i];
1363 /* Search for the state whose node_set is equivalent to NODES.
1364 Return the pointer to the state, if we found it in the DFA.
1365 Otherwise create the new one and return it. In case of an error
1366 return NULL and set the error code in ERR.
1367 Note: - We assume NULL as the invalid state, then it is possible that
1368 return value is NULL and ERR is REG_NOERROR.
1369 - We never return non-NULL value in case of any errors, it is for
1372 static re_dfastate_t*
1374 re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes)
1377 re_dfastate_t *new_state;
1378 struct re_state_table_entry *spot;
1381 /* Suppress bogus uninitialized-variable warnings. */
1384 if (BE (nodes->nelem == 0, 0))
1389 hash = calc_state_hash (nodes, 0);
1390 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1392 for (i = 0 ; i < spot->num ; i++)
1394 re_dfastate_t *state = spot->array[i];
1395 if (hash != state->hash)
1397 if (re_node_set_compare (&state->nodes, nodes))
1401 /* There are no appropriate state in the dfa, create the new one. */
1402 new_state = create_ci_newstate (dfa, nodes, hash);
1403 if (BE (new_state != NULL, 1))
1412 /* Search for the state whose node_set is equivalent to NODES and
1413 whose context is equivalent to CONTEXT.
1414 Return the pointer to the state, if we found it in the DFA.
1415 Otherwise create the new one and return it. In case of an error
1416 return NULL and set the error code in ERR.
1417 Note: - We assume NULL as the invalid state, then it is possible that
1418 return value is NULL and ERR is REG_NOERROR.
1419 - We never return non-NULL value in case of any errors, it is for
1422 static re_dfastate_t*
1424 re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa,
1425 const re_node_set *nodes, unsigned int context)
1428 re_dfastate_t *new_state;
1429 struct re_state_table_entry *spot;
1432 /* Suppress bogus uninitialized-variable warnings. */
1435 if (nodes->nelem == 0)
1440 hash = calc_state_hash (nodes, context);
1441 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1443 for (i = 0 ; i < spot->num ; i++)
1445 re_dfastate_t *state = spot->array[i];
1446 if (state->hash == hash
1447 && state->context == context
1448 && re_node_set_compare (state->entrance_nodes, nodes))
1451 /* There are no appropriate state in `dfa', create the new one. */
1452 new_state = create_cd_newstate (dfa, nodes, context, hash);
1453 if (BE (new_state != NULL, 1))
1462 /* Finish initialization of the new state NEWSTATE, and using its hash value
1463 HASH put in the appropriate bucket of DFA's state table. Return value
1464 indicates the error code if failed. */
1466 static reg_errcode_t
1468 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash)
1470 struct re_state_table_entry *spot;
1474 newstate->hash = hash;
1475 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1476 if (BE (err != REG_NOERROR, 0))
1478 for (i = 0; i < newstate->nodes.nelem; i++)
1480 Idx elem = newstate->nodes.elems[i];
1481 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1482 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1485 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1486 if (BE (spot->alloc <= spot->num, 0))
1488 Idx new_alloc = 2 * spot->num + 2;
1489 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1491 if (BE (new_array == NULL, 0))
1493 spot->array = new_array;
1494 spot->alloc = new_alloc;
1496 spot->array[spot->num++] = newstate;
1500 /* Create the new state which is independ of contexts.
1501 Return the new state if succeeded, otherwise return NULL. */
1503 static re_dfastate_t *
1505 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1510 re_dfastate_t *newstate;
1512 newstate = re_calloc (re_dfastate_t, 1);
1513 if (BE (newstate == NULL, 0))
1515 err = re_node_set_init_copy (&newstate->nodes, nodes);
1516 if (BE (err != REG_NOERROR, 0))
1522 newstate->entrance_nodes = &newstate->nodes;
1523 for (i = 0 ; i < nodes->nelem ; i++)
1525 re_token_t *node = dfa->nodes + nodes->elems[i];
1526 re_token_type_t type = node->type;
1527 if (type == CHARACTER && !node->constraint)
1529 #ifdef RE_ENABLE_I18N
1530 newstate->accept_mb |= node->accept_mb;
1531 #endif /* RE_ENABLE_I18N */
1533 /* If the state has the halt node, the state is a halt state. */
1534 if (type == END_OF_RE)
1536 else if (type == OP_BACK_REF)
1537 newstate->has_backref = 1;
1538 else if (type == ANCHOR || node->constraint)
1539 newstate->has_constraint = 1;
1541 err = register_state (dfa, newstate, hash);
1542 if (BE (err != REG_NOERROR, 0))
1544 free_state (newstate);
1550 /* Create the new state which is depend on the context CONTEXT.
1551 Return the new state if succeeded, otherwise return NULL. */
1553 static re_dfastate_t *
1555 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1556 unsigned int context, re_hashval_t hash)
1558 Idx i, nctx_nodes = 0;
1560 re_dfastate_t *newstate;
1562 newstate = re_calloc (re_dfastate_t, 1);
1563 if (BE (newstate == NULL, 0))
1565 err = re_node_set_init_copy (&newstate->nodes, nodes);
1566 if (BE (err != REG_NOERROR, 0))
1572 newstate->context = context;
1573 newstate->entrance_nodes = &newstate->nodes;
1575 for (i = 0 ; i < nodes->nelem ; i++)
1577 unsigned int constraint = 0;
1578 re_token_t *node = dfa->nodes + nodes->elems[i];
1579 re_token_type_t type = node->type;
1580 if (node->constraint)
1581 constraint = node->constraint;
1583 if (type == CHARACTER && !constraint)
1585 #ifdef RE_ENABLE_I18N
1586 newstate->accept_mb |= node->accept_mb;
1587 #endif /* RE_ENABLE_I18N */
1589 /* If the state has the halt node, the state is a halt state. */
1590 if (type == END_OF_RE)
1592 else if (type == OP_BACK_REF)
1593 newstate->has_backref = 1;
1594 else if (type == ANCHOR)
1595 constraint = node->opr.ctx_type;
1599 if (newstate->entrance_nodes == &newstate->nodes)
1601 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1602 if (BE (newstate->entrance_nodes == NULL, 0))
1604 free_state (newstate);
1607 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1609 newstate->has_constraint = 1;
1612 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1614 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1619 err = register_state (dfa, newstate, hash);
1620 if (BE (err != REG_NOERROR, 0))
1622 free_state (newstate);
1630 free_state (re_dfastate_t *state)
1632 re_node_set_free (&state->non_eps_nodes);
1633 re_node_set_free (&state->inveclosure);
1634 if (state->entrance_nodes != &state->nodes)
1636 re_node_set_free (state->entrance_nodes);
1637 re_free (state->entrance_nodes);
1639 re_node_set_free (&state->nodes);
1640 re_free (state->word_trtable);
1641 re_free (state->trtable);