1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static void re_string_construct_common (const char *str, int len,
22 RE_TRANSLATE_TYPE trans, int icase,
23 const re_dfa_t *dfa) internal_function;
25 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
26 wint_t *last_wc) internal_function;
27 #endif /* RE_ENABLE_I18N */
28 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
29 unsigned int hash) internal_function;
30 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
31 const re_node_set *nodes,
32 unsigned int hash) internal_function;
33 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
34 const re_node_set *nodes,
36 unsigned int hash) internal_function;
37 static unsigned int inline calc_state_hash (const re_node_set *nodes,
38 unsigned int context) internal_function;
40 /* Functions for string operation. */
42 /* This function allocate the buffers. It is necessary to call
43 re_string_reconstruct before using the object. */
47 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
48 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
53 /* Ensure at least one character fits into the buffers. */
54 if (init_len < dfa->mb_cur_max)
55 init_len = dfa->mb_cur_max;
56 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
57 re_string_construct_common (str, len, pstr, trans, icase, dfa);
59 ret = re_string_realloc_buffers (pstr, init_buf_len);
60 if (BE (ret != REG_NOERROR, 0))
63 pstr->word_char = dfa->word_char;
64 pstr->word_ops_used = dfa->word_ops_used;
65 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
66 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
67 pstr->valid_raw_len = pstr->valid_len;
71 /* This function allocate the buffers, and initialize them. */
75 re_string_construct (re_string_t *pstr, const char *str, int len,
76 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
79 memset (pstr, '\0', sizeof (re_string_t));
80 re_string_construct_common (str, len, pstr, trans, icase, dfa);
84 ret = re_string_realloc_buffers (pstr, len + 1);
85 if (BE (ret != REG_NOERROR, 0))
88 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
93 if (dfa->mb_cur_max > 1)
97 ret = build_wcs_upper_buffer (pstr);
98 if (BE (ret != REG_NOERROR, 0))
100 if (pstr->valid_raw_len >= len)
102 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
104 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
105 if (BE (ret != REG_NOERROR, 0))
110 #endif /* RE_ENABLE_I18N */
111 build_upper_buffer (pstr);
115 #ifdef RE_ENABLE_I18N
116 if (dfa->mb_cur_max > 1)
117 build_wcs_buffer (pstr);
119 #endif /* RE_ENABLE_I18N */
122 re_string_translate_buffer (pstr);
125 pstr->valid_len = pstr->bufs_len;
126 pstr->valid_raw_len = pstr->bufs_len;
134 /* Helper functions for re_string_allocate, and re_string_construct. */
138 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
140 #ifdef RE_ENABLE_I18N
141 if (pstr->mb_cur_max > 1)
143 wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
144 if (BE (new_wcs == NULL, 0))
147 if (pstr->offsets != NULL)
149 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
150 if (BE (new_offsets == NULL, 0))
152 pstr->offsets = new_offsets;
155 #endif /* RE_ENABLE_I18N */
156 if (pstr->mbs_allocated)
158 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
160 if (BE (new_mbs == NULL, 0))
164 pstr->bufs_len = new_buf_len;
171 re_string_construct_common (const char *str, int len, re_string_t *pstr,
172 RE_TRANSLATE_TYPE trans, int icase,
175 pstr->raw_mbs = (const unsigned char *) str;
178 pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
179 pstr->icase = icase ? 1 : 0;
180 pstr->mbs_allocated = (trans != NULL || icase);
181 pstr->mb_cur_max = dfa->mb_cur_max;
182 pstr->is_utf8 = dfa->is_utf8;
183 pstr->map_notascii = dfa->map_notascii;
184 pstr->stop = pstr->len;
185 pstr->raw_stop = pstr->stop;
188 #ifdef RE_ENABLE_I18N
190 /* Build wide character buffer PSTR->WCS.
191 If the byte sequence of the string are:
192 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193 Then wide character buffer will be:
194 <wc1> , WEOF , <wc2> , WEOF , <wc3>
195 We use WEOF for padding, they indicate that the position isn't
196 a first byte of a multibyte character.
198 Note that this function assumes PSTR->VALID_LEN elements are already
199 built and starts from PSTR->VALID_LEN. */
203 build_wcs_buffer (re_string_t *pstr)
206 unsigned char buf[MB_LEN_MAX];
207 assert (MB_LEN_MAX >= pstr->mb_cur_max);
209 unsigned char buf[64];
212 int byte_idx, end_idx, remain_len;
215 /* Build the buffers from pstr->valid_len to either pstr->len or
217 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
218 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
223 remain_len = end_idx - byte_idx;
224 prev_st = pstr->cur_state;
225 /* Apply the translation if we need. */
226 if (BE (pstr->trans != NULL, 0))
230 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
232 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
233 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
235 p = (const char *) buf;
238 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
239 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
240 if (BE (mbclen == (size_t) -2, 0))
242 /* The buffer doesn't have enough space, finish to build. */
243 pstr->cur_state = prev_st;
246 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
248 /* We treat these cases as a singlebyte character. */
250 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
251 if (BE (pstr->trans != NULL, 0))
252 wc = pstr->trans[wc];
253 pstr->cur_state = prev_st;
256 /* Write wide character and padding. */
257 pstr->wcs[byte_idx++] = wc;
258 /* Write paddings. */
259 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
260 pstr->wcs[byte_idx++] = WEOF;
262 pstr->valid_len = byte_idx;
263 pstr->valid_raw_len = byte_idx;
266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
267 but for REG_ICASE. */
271 build_wcs_upper_buffer (re_string_t *pstr)
274 int src_idx, byte_idx, end_idx, remain_len;
277 char buf[MB_LEN_MAX];
278 assert (MB_LEN_MAX >= pstr->mb_cur_max);
283 byte_idx = pstr->valid_len;
284 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
286 /* The following optimization assumes that ASCII characters can be
287 mapped to wide characters with a simple cast. */
288 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
290 while (byte_idx < end_idx)
294 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
295 && mbsinit (&pstr->cur_state))
297 /* In case of a singlebyte character. */
299 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
300 /* The next step uses the assumption that wchar_t is encoded
301 ASCII-safe: all ASCII values can be converted like this. */
302 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
307 remain_len = end_idx - byte_idx;
308 prev_st = pstr->cur_state;
309 mbclen = mbrtowc (&wc,
310 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
311 + byte_idx), remain_len, &pstr->cur_state);
312 if (BE (mbclen + 2 > 2, 1))
320 mbcdlen = wcrtomb (buf, wcu, &prev_st);
321 if (BE (mbclen == mbcdlen, 1))
322 memcpy (pstr->mbs + byte_idx, buf, mbclen);
330 memcpy (pstr->mbs + byte_idx,
331 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
332 pstr->wcs[byte_idx++] = wcu;
333 /* Write paddings. */
334 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
335 pstr->wcs[byte_idx++] = WEOF;
337 else if (mbclen == (size_t) -1 || mbclen == 0)
339 /* It is an invalid character or '\0'. Just use the byte. */
340 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
341 pstr->mbs[byte_idx] = ch;
342 /* And also cast it to wide char. */
343 pstr->wcs[byte_idx++] = (wchar_t) ch;
344 if (BE (mbclen == (size_t) -1, 0))
345 pstr->cur_state = prev_st;
349 /* The buffer doesn't have enough space, finish to build. */
350 pstr->cur_state = prev_st;
354 pstr->valid_len = byte_idx;
355 pstr->valid_raw_len = byte_idx;
359 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
364 remain_len = end_idx - byte_idx;
365 prev_st = pstr->cur_state;
366 if (BE (pstr->trans != NULL, 0))
370 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
372 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
373 buf[i] = pstr->trans[ch];
375 p = (const char *) buf;
378 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
379 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
380 if (BE (mbclen + 2 > 2, 1))
388 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
389 if (BE (mbclen == mbcdlen, 1))
390 memcpy (pstr->mbs + byte_idx, buf, mbclen);
391 else if (mbcdlen != (size_t) -1)
395 if (byte_idx + mbcdlen > pstr->bufs_len)
397 pstr->cur_state = prev_st;
401 if (pstr->offsets == NULL)
403 pstr->offsets = re_malloc (int, pstr->bufs_len);
405 if (pstr->offsets == NULL)
408 if (!pstr->offsets_needed)
410 for (i = 0; i < (size_t) byte_idx; ++i)
411 pstr->offsets[i] = i;
412 pstr->offsets_needed = 1;
415 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
416 pstr->wcs[byte_idx] = wcu;
417 pstr->offsets[byte_idx] = src_idx;
418 for (i = 1; i < mbcdlen; ++i)
420 pstr->offsets[byte_idx + i]
421 = src_idx + (i < mbclen ? i : mbclen - 1);
422 pstr->wcs[byte_idx + i] = WEOF;
424 pstr->len += mbcdlen - mbclen;
425 if (pstr->raw_stop > src_idx)
426 pstr->stop += mbcdlen - mbclen;
427 end_idx = (pstr->bufs_len > pstr->len)
428 ? pstr->len : pstr->bufs_len;
434 memcpy (pstr->mbs + byte_idx, p, mbclen);
437 memcpy (pstr->mbs + byte_idx, p, mbclen);
439 if (BE (pstr->offsets_needed != 0, 0))
442 for (i = 0; i < mbclen; ++i)
443 pstr->offsets[byte_idx + i] = src_idx + i;
447 pstr->wcs[byte_idx++] = wcu;
448 /* Write paddings. */
449 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
450 pstr->wcs[byte_idx++] = WEOF;
452 else if (mbclen == (size_t) -1 || mbclen == 0)
454 /* It is an invalid character or '\0'. Just use the byte. */
455 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
457 if (BE (pstr->trans != NULL, 0))
458 ch = pstr->trans [ch];
459 pstr->mbs[byte_idx] = ch;
461 if (BE (pstr->offsets_needed != 0, 0))
462 pstr->offsets[byte_idx] = src_idx;
465 /* And also cast it to wide char. */
466 pstr->wcs[byte_idx++] = (wchar_t) ch;
467 if (BE (mbclen == (size_t) -1, 0))
468 pstr->cur_state = prev_st;
472 /* The buffer doesn't have enough space, finish to build. */
473 pstr->cur_state = prev_st;
477 pstr->valid_len = byte_idx;
478 pstr->valid_raw_len = src_idx;
482 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
487 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
494 /* Skip the characters which are not necessary to check. */
495 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
496 rawbuf_idx < new_raw_idx;)
499 remain_len = pstr->len - rawbuf_idx;
500 prev_st = pstr->cur_state;
501 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
502 remain_len, &pstr->cur_state);
503 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
505 /* We treat these cases as a singlebyte character. */
507 pstr->cur_state = prev_st;
509 /* Then proceed the next character. */
510 rawbuf_idx += mbclen;
512 *last_wc = (wint_t) wc;
515 #endif /* RE_ENABLE_I18N */
517 /* Build the buffer PSTR->MBS, and apply the translation if we need.
518 This function is used in case of REG_ICASE. */
522 build_upper_buffer (re_string_t *pstr)
524 int char_idx, end_idx;
525 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
527 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
529 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
530 if (BE (pstr->trans != NULL, 0))
531 ch = pstr->trans[ch];
533 pstr->mbs[char_idx] = toupper (ch);
535 pstr->mbs[char_idx] = ch;
537 pstr->valid_len = char_idx;
538 pstr->valid_raw_len = char_idx;
541 /* Apply TRANS to the buffer in PSTR. */
545 re_string_translate_buffer (re_string_t *pstr)
547 int buf_idx, end_idx;
548 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
550 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
552 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
553 pstr->mbs[buf_idx] = pstr->trans[ch];
556 pstr->valid_len = buf_idx;
557 pstr->valid_raw_len = buf_idx;
560 /* This function re-construct the buffers.
561 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
562 convert to upper case in case of REG_ICASE, apply translation. */
566 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
568 int offset = idx - pstr->raw_mbs_idx;
569 if (BE (offset < 0, 0))
572 #ifdef RE_ENABLE_I18N
573 if (pstr->mb_cur_max > 1)
574 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
575 #endif /* RE_ENABLE_I18N */
576 pstr->len = pstr->raw_len;
577 pstr->stop = pstr->raw_stop;
579 pstr->raw_mbs_idx = 0;
580 pstr->valid_raw_len = 0;
581 pstr->offsets_needed = 0;
582 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
583 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
584 if (!pstr->mbs_allocated)
585 pstr->mbs = (unsigned char *) pstr->raw_mbs;
589 if (BE (offset != 0, 1))
591 /* Are the characters which are already checked remain? */
592 if (BE (offset < pstr->valid_raw_len, 1)
593 #ifdef RE_ENABLE_I18N
594 /* Handling this would enlarge the code too much.
595 Accept a slowdown in that case. */
596 && pstr->offsets_needed == 0
600 /* Yes, move them to the front of the buffer. */
601 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
602 #ifdef RE_ENABLE_I18N
603 if (pstr->mb_cur_max > 1)
604 memmove (pstr->wcs, pstr->wcs + offset,
605 (pstr->valid_len - offset) * sizeof (wint_t));
606 #endif /* RE_ENABLE_I18N */
607 if (BE (pstr->mbs_allocated, 0))
608 memmove (pstr->mbs, pstr->mbs + offset,
609 pstr->valid_len - offset);
610 pstr->valid_len -= offset;
611 pstr->valid_raw_len -= offset;
613 assert (pstr->valid_len > 0);
618 /* No, skip all characters until IDX. */
619 #ifdef RE_ENABLE_I18N
620 if (BE (pstr->offsets_needed, 0))
622 pstr->len = pstr->raw_len - idx + offset;
623 pstr->stop = pstr->raw_stop - idx + offset;
624 pstr->offsets_needed = 0;
628 pstr->valid_raw_len = 0;
629 #ifdef RE_ENABLE_I18N
630 if (pstr->mb_cur_max > 1)
637 const unsigned char *raw, *p, *q, *end;
639 /* Special case UTF-8. Multi-byte chars start with any
640 byte other than 0x80 - 0xbf. */
641 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
642 end = raw + (offset - pstr->mb_cur_max);
643 for (p = raw + offset - 1; p >= end; --p)
644 if ((*p & 0xc0) != 0x80)
648 int mlen = raw + pstr->len - p;
649 unsigned char buf[6];
652 if (BE (pstr->trans != NULL, 0))
654 int i = mlen < 6 ? mlen : 6;
656 buf[i] = pstr->trans[p[i]];
659 /* XXX Don't use mbrtowc, we know which conversion
660 to use (UTF-8 -> UCS4). */
661 memset (&cur_state, 0, sizeof (cur_state));
662 mlen = (mbrtowc (&wc2, (const char *) p, mlen,
664 - (raw + offset - p));
667 memset (&pstr->cur_state, '\0',
669 pstr->valid_len = mlen;
677 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
678 if (BE (pstr->valid_len, 0))
680 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
681 pstr->wcs[wcs_idx] = WEOF;
682 if (pstr->mbs_allocated)
683 memset (pstr->mbs, 255, pstr->valid_len);
685 pstr->valid_raw_len = pstr->valid_len;
686 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
687 && IS_WIDE_WORD_CHAR (wc))
689 : ((IS_WIDE_NEWLINE (wc)
690 && pstr->newline_anchor)
691 ? CONTEXT_NEWLINE : 0));
694 #endif /* RE_ENABLE_I18N */
696 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
699 pstr->tip_context = (bitset_contain (pstr->word_char, c)
701 : ((IS_NEWLINE (c) && pstr->newline_anchor)
702 ? CONTEXT_NEWLINE : 0));
705 if (!BE (pstr->mbs_allocated, 0))
708 pstr->raw_mbs_idx = idx;
710 pstr->stop -= offset;
712 /* Then build the buffers. */
713 #ifdef RE_ENABLE_I18N
714 if (pstr->mb_cur_max > 1)
718 int ret = build_wcs_upper_buffer (pstr);
719 if (BE (ret != REG_NOERROR, 0))
723 build_wcs_buffer (pstr);
726 #endif /* RE_ENABLE_I18N */
727 if (BE (pstr->mbs_allocated, 0))
730 build_upper_buffer (pstr);
731 else if (pstr->trans != NULL)
732 re_string_translate_buffer (pstr);
735 pstr->valid_len = pstr->len;
743 re_string_peek_byte_case (const re_string_t *pstr, int idx)
747 /* Handle the common (easiest) cases first. */
748 if (BE (!pstr->mbs_allocated, 1))
749 return re_string_peek_byte (pstr, idx);
751 #ifdef RE_ENABLE_I18N
752 if (pstr->mb_cur_max > 1
753 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
754 return re_string_peek_byte (pstr, idx);
757 off = pstr->cur_idx + idx;
758 #ifdef RE_ENABLE_I18N
759 if (pstr->offsets_needed)
760 off = pstr->offsets[off];
763 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
765 #ifdef RE_ENABLE_I18N
766 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
767 this function returns CAPITAL LETTER I instead of first byte of
768 DOTLESS SMALL LETTER I. The latter would confuse the parser,
769 since peek_byte_case doesn't advance cur_idx in any way. */
770 if (pstr->offsets_needed && !isascii (ch))
771 return re_string_peek_byte (pstr, idx);
779 re_string_fetch_byte_case (re_string_t *pstr)
781 if (BE (!pstr->mbs_allocated, 1))
782 return re_string_fetch_byte (pstr);
784 #ifdef RE_ENABLE_I18N
785 if (pstr->offsets_needed)
789 /* For tr_TR.UTF-8 [[:islower:]] there is
790 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
791 in that case the whole multi-byte character and return
792 the original letter. On the other side, with
793 [[: DOTLESS SMALL LETTER I return [[:I, as doing
794 anything else would complicate things too much. */
796 if (!re_string_first_byte (pstr, pstr->cur_idx))
797 return re_string_fetch_byte (pstr);
799 off = pstr->offsets[pstr->cur_idx];
800 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
803 return re_string_fetch_byte (pstr);
805 re_string_skip_bytes (pstr,
806 re_string_char_size_at (pstr, pstr->cur_idx));
811 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
816 re_string_destruct (re_string_t *pstr)
818 #ifdef RE_ENABLE_I18N
820 re_free (pstr->offsets);
821 #endif /* RE_ENABLE_I18N */
822 if (pstr->mbs_allocated)
826 /* Return the context at IDX in INPUT. */
830 re_string_context_at (const re_string_t *input, int idx, int eflags)
834 /* In this case, we use the value stored in input->tip_context,
835 since we can't know the character in input->mbs[-1] here. */
836 return input->tip_context;
837 if (BE (idx == input->len, 0))
838 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
839 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
840 #ifdef RE_ENABLE_I18N
841 if (input->mb_cur_max > 1)
845 while(input->wcs[wc_idx] == WEOF)
848 /* It must not happen. */
849 assert (wc_idx >= 0);
853 return input->tip_context;
855 wc = input->wcs[wc_idx];
856 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
858 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
859 ? CONTEXT_NEWLINE : 0);
864 c = re_string_byte_at (input, idx);
865 if (bitset_contain (input->word_char, c))
867 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
871 /* Functions for set operation. */
875 re_node_set_alloc (re_node_set *set, int size)
879 set->elems = re_malloc (int, size);
880 if (BE (set->elems == NULL, 0))
887 re_node_set_init_1 (re_node_set *set, int elem)
891 set->elems = re_malloc (int, 1);
892 if (BE (set->elems == NULL, 0))
894 set->alloc = set->nelem = 0;
897 set->elems[0] = elem;
903 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
906 set->elems = re_malloc (int, 2);
907 if (BE (set->elems == NULL, 0))
912 set->elems[0] = elem1;
919 set->elems[0] = elem1;
920 set->elems[1] = elem2;
924 set->elems[0] = elem2;
925 set->elems[1] = elem1;
933 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
935 dest->nelem = src->nelem;
938 dest->alloc = dest->nelem;
939 dest->elems = re_malloc (int, dest->alloc);
940 if (BE (dest->elems == NULL, 0))
942 dest->alloc = dest->nelem = 0;
945 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
948 re_node_set_init_empty (dest);
952 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
953 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
954 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
958 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
959 const re_node_set *src2)
961 int i1, i2, is, id, delta, sbase;
962 if (src1->nelem == 0 || src2->nelem == 0)
965 /* We need dest->nelem + 2 * elems_in_intersection; this is a
966 conservative estimate. */
967 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
969 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
970 int *new_elems = re_realloc (dest->elems, int, new_alloc);
971 if (BE (new_elems == NULL, 0))
973 dest->elems = new_elems;
974 dest->alloc = new_alloc;
977 /* Find the items in the intersection of SRC1 and SRC2, and copy
978 into the top of DEST those that are not already in DEST itself. */
979 sbase = dest->nelem + src1->nelem + src2->nelem;
980 i1 = src1->nelem - 1;
981 i2 = src2->nelem - 1;
982 id = dest->nelem - 1;
985 if (src1->elems[i1] == src2->elems[i2])
987 /* Try to find the item in DEST. Maybe we could binary search? */
988 while (id >= 0 && dest->elems[id] > src1->elems[i1])
991 if (id < 0 || dest->elems[id] != src1->elems[i1])
992 dest->elems[--sbase] = src1->elems[i1];
994 if (--i1 < 0 || --i2 < 0)
998 /* Lower the highest of the two items. */
999 else if (src1->elems[i1] < src2->elems[i2])
1011 id = dest->nelem - 1;
1012 is = dest->nelem + src1->nelem + src2->nelem - 1;
1013 delta = is - sbase + 1;
1015 /* Now copy. When DELTA becomes zero, the remaining
1016 DEST elements are already in place; this is more or
1017 less the same loop that is in re_node_set_merge. */
1018 dest->nelem += delta;
1019 if (delta > 0 && id >= 0)
1022 if (dest->elems[is] > dest->elems[id])
1024 /* Copy from the top. */
1025 dest->elems[id + delta--] = dest->elems[is--];
1031 /* Slide from the bottom. */
1032 dest->elems[id + delta] = dest->elems[id];
1038 /* Copy remaining SRC elements. */
1039 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1044 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1045 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1047 static reg_errcode_t
1049 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1050 const re_node_set *src2)
1053 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1055 dest->alloc = src1->nelem + src2->nelem;
1056 dest->elems = re_malloc (int, dest->alloc);
1057 if (BE (dest->elems == NULL, 0))
1062 if (src1 != NULL && src1->nelem > 0)
1063 return re_node_set_init_copy (dest, src1);
1064 else if (src2 != NULL && src2->nelem > 0)
1065 return re_node_set_init_copy (dest, src2);
1067 re_node_set_init_empty (dest);
1070 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1072 if (src1->elems[i1] > src2->elems[i2])
1074 dest->elems[id++] = src2->elems[i2++];
1077 if (src1->elems[i1] == src2->elems[i2])
1079 dest->elems[id++] = src1->elems[i1++];
1081 if (i1 < src1->nelem)
1083 memcpy (dest->elems + id, src1->elems + i1,
1084 (src1->nelem - i1) * sizeof (int));
1085 id += src1->nelem - i1;
1087 else if (i2 < src2->nelem)
1089 memcpy (dest->elems + id, src2->elems + i2,
1090 (src2->nelem - i2) * sizeof (int));
1091 id += src2->nelem - i2;
1097 /* Calculate the union set of the sets DEST and SRC. And store it to
1098 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1100 static reg_errcode_t
1102 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1104 int is, id, sbase, delta;
1105 if (src == NULL || src->nelem == 0)
1107 if (dest->alloc < 2 * src->nelem + dest->nelem)
1109 int new_alloc = 2 * (src->nelem + dest->alloc);
1110 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1111 if (BE (new_buffer == NULL, 0))
1113 dest->elems = new_buffer;
1114 dest->alloc = new_alloc;
1117 if (BE (dest->nelem == 0, 0))
1119 dest->nelem = src->nelem;
1120 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1124 /* Copy into the top of DEST the items of SRC that are not
1125 found in DEST. Maybe we could binary search in DEST? */
1126 for (sbase = dest->nelem + 2 * src->nelem,
1127 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1129 if (dest->elems[id] == src->elems[is])
1131 else if (dest->elems[id] < src->elems[is])
1132 dest->elems[--sbase] = src->elems[is--];
1133 else /* if (dest->elems[id] > src->elems[is]) */
1139 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1141 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1144 id = dest->nelem - 1;
1145 is = dest->nelem + 2 * src->nelem - 1;
1146 delta = is - sbase + 1;
1150 /* Now copy. When DELTA becomes zero, the remaining
1151 DEST elements are already in place. */
1152 dest->nelem += delta;
1155 if (dest->elems[is] > dest->elems[id])
1157 /* Copy from the top. */
1158 dest->elems[id + delta--] = dest->elems[is--];
1164 /* Slide from the bottom. */
1165 dest->elems[id + delta] = dest->elems[id];
1168 /* Copy remaining SRC elements. */
1169 memcpy (dest->elems, dest->elems + sbase,
1170 delta * sizeof (int));
1179 /* Insert the new element ELEM to the re_node_set* SET.
1180 SET should not already have ELEM.
1181 return -1 if an error is occured, return 1 otherwise. */
1185 re_node_set_insert (re_node_set *set, int elem)
1188 /* In case the set is empty. */
1189 if (set->alloc == 0)
1191 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1197 if (BE (set->nelem, 0) == 0)
1199 /* We already guaranteed above that set->alloc != 0. */
1200 set->elems[0] = elem;
1205 /* Realloc if we need. */
1206 if (set->alloc == set->nelem)
1209 set->alloc = set->alloc * 2;
1210 new_elems = re_realloc (set->elems, int, set->alloc);
1211 if (BE (new_elems == NULL, 0))
1213 set->elems = new_elems;
1216 /* Move the elements which follows the new element. Test the
1217 first element separately to skip a check in the inner loop. */
1218 if (elem < set->elems[0])
1221 for (idx = set->nelem; idx > 0; idx--)
1222 set->elems[idx] = set->elems[idx - 1];
1226 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1227 set->elems[idx] = set->elems[idx - 1];
1230 /* Insert the new element. */
1231 set->elems[idx] = elem;
1236 /* Insert the new element ELEM to the re_node_set* SET.
1237 SET should not already have any element greater than or equal to ELEM.
1238 Return -1 if an error is occured, return 1 otherwise. */
1242 re_node_set_insert_last (re_node_set *set, int elem)
1244 /* Realloc if we need. */
1245 if (set->alloc == set->nelem)
1248 set->alloc = (set->alloc + 1) * 2;
1249 new_elems = re_realloc (set->elems, int, set->alloc);
1250 if (BE (new_elems == NULL, 0))
1252 set->elems = new_elems;
1255 /* Insert the new element. */
1256 set->elems[set->nelem++] = elem;
1260 /* Compare two node sets SET1 and SET2.
1261 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1265 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1268 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1270 for (i = set1->nelem ; --i >= 0 ; )
1271 if (set1->elems[i] != set2->elems[i])
1276 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1280 re_node_set_contains (const re_node_set *set, int elem)
1282 unsigned int idx, right, mid;
1283 if (set->nelem <= 0)
1286 /* Binary search the element. */
1288 right = set->nelem - 1;
1291 mid = (idx + right) / 2;
1292 if (set->elems[mid] < elem)
1297 return set->elems[idx] == elem ? idx + 1 : 0;
1302 re_node_set_remove_at (re_node_set *set, int idx)
1304 if (idx < 0 || idx >= set->nelem)
1307 for (; idx < set->nelem; idx++)
1308 set->elems[idx] = set->elems[idx + 1];
1312 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1313 Or return -1, if an error will be occured. */
1317 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1319 int type = token.type;
1320 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1322 int new_nodes_alloc = dfa->nodes_alloc * 2;
1323 int *new_nexts, *new_indices;
1324 re_node_set *new_edests, *new_eclosures;
1326 re_token_t *new_nodes = re_realloc (dfa->nodes, re_token_t,
1328 if (BE (new_nodes == NULL, 0))
1330 dfa->nodes = new_nodes;
1331 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1332 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1333 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1334 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1335 if (BE (new_nexts == NULL || new_indices == NULL
1336 || new_edests == NULL || new_eclosures == NULL, 0))
1338 dfa->nexts = new_nexts;
1339 dfa->org_indices = new_indices;
1340 dfa->edests = new_edests;
1341 dfa->eclosures = new_eclosures;
1342 dfa->nodes_alloc = new_nodes_alloc;
1344 dfa->nodes[dfa->nodes_len] = token;
1345 dfa->nodes[dfa->nodes_len].constraint = 0;
1346 #ifdef RE_ENABLE_I18N
1347 dfa->nodes[dfa->nodes_len].accept_mb =
1348 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1350 dfa->nexts[dfa->nodes_len] = -1;
1351 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1352 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1353 return dfa->nodes_len++;
1356 static unsigned int inline
1358 calc_state_hash (const re_node_set *nodes, unsigned int context)
1360 unsigned int hash = nodes->nelem + context;
1362 for (i = 0 ; i < nodes->nelem ; i++)
1363 hash += nodes->elems[i];
1367 /* Search for the state whose node_set is equivalent to NODES.
1368 Return the pointer to the state, if we found it in the DFA.
1369 Otherwise create the new one and return it. In case of an error
1370 return NULL and set the error code in ERR.
1371 Note: - We assume NULL as the invalid state, then it is possible that
1372 return value is NULL and ERR is REG_NOERROR.
1373 - We never return non-NULL value in case of any errors, it is for
1376 static re_dfastate_t*
1378 re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes)
1381 re_dfastate_t *new_state;
1382 struct re_state_table_entry *spot;
1385 /* Suppress bogus uninitialized-variable warnings. */
1388 if (BE (nodes->nelem == 0, 0))
1393 hash = calc_state_hash (nodes, 0);
1394 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1396 for (i = 0 ; i < spot->num ; i++)
1398 re_dfastate_t *state = spot->array[i];
1399 if (hash != state->hash)
1401 if (re_node_set_compare (&state->nodes, nodes))
1405 /* There are no appropriate state in the dfa, create the new one. */
1406 new_state = create_ci_newstate (dfa, nodes, hash);
1407 if (BE (new_state != NULL, 1))
1416 /* Search for the state whose node_set is equivalent to NODES and
1417 whose context is equivalent to CONTEXT.
1418 Return the pointer to the state, if we found it in the DFA.
1419 Otherwise create the new one and return it. In case of an error
1420 return NULL and set the error code in ERR.
1421 Note: - We assume NULL as the invalid state, then it is possible that
1422 return value is NULL and ERR is REG_NOERROR.
1423 - We never return non-NULL value in case of any errors, it is for
1426 static re_dfastate_t*
1428 re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa,
1429 const re_node_set *nodes, unsigned int context)
1432 re_dfastate_t *new_state;
1433 struct re_state_table_entry *spot;
1436 /* Suppress bogus uninitialized-variable warnings. */
1439 if (nodes->nelem == 0)
1444 hash = calc_state_hash (nodes, context);
1445 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1447 for (i = 0 ; i < spot->num ; i++)
1449 re_dfastate_t *state = spot->array[i];
1450 if (state->hash == hash
1451 && state->context == context
1452 && re_node_set_compare (state->entrance_nodes, nodes))
1455 /* There are no appropriate state in `dfa', create the new one. */
1456 new_state = create_cd_newstate (dfa, nodes, context, hash);
1457 if (BE (new_state != NULL, 1))
1466 /* Finish initialization of the new state NEWSTATE, and using its hash value
1467 HASH put in the appropriate bucket of DFA's state table. Return value
1468 indicates the error code if failed. */
1470 static reg_errcode_t
1472 register_state (re_dfa_t *dfa, re_dfastate_t *newstate, unsigned int hash)
1474 struct re_state_table_entry *spot;
1478 newstate->hash = hash;
1479 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1480 if (BE (err != REG_NOERROR, 0))
1482 for (i = 0; i < newstate->nodes.nelem; i++)
1484 int elem = newstate->nodes.elems[i];
1485 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1486 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1489 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1490 if (BE (spot->alloc <= spot->num, 0))
1492 int new_alloc = 2 * spot->num + 2;
1493 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1495 if (BE (new_array == NULL, 0))
1497 spot->array = new_array;
1498 spot->alloc = new_alloc;
1500 spot->array[spot->num++] = newstate;
1504 /* Create the new state which is independ of contexts.
1505 Return the new state if succeeded, otherwise return NULL. */
1507 static re_dfastate_t *
1509 create_ci_newstate (re_dfa_t *dfa, const re_node_set *nodes, unsigned int hash)
1513 re_dfastate_t *newstate;
1515 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1516 if (BE (newstate == NULL, 0))
1518 err = re_node_set_init_copy (&newstate->nodes, nodes);
1519 if (BE (err != REG_NOERROR, 0))
1525 newstate->entrance_nodes = &newstate->nodes;
1526 for (i = 0 ; i < nodes->nelem ; i++)
1528 re_token_t *node = dfa->nodes + nodes->elems[i];
1529 re_token_type_t type = node->type;
1530 if (type == CHARACTER && !node->constraint)
1532 #ifdef RE_ENABLE_I18N
1533 newstate->accept_mb |= node->accept_mb;
1534 #endif /* RE_ENABLE_I18N */
1536 /* If the state has the halt node, the state is a halt state. */
1537 if (type == END_OF_RE)
1539 else if (type == OP_BACK_REF)
1540 newstate->has_backref = 1;
1541 else if (type == ANCHOR || node->constraint)
1542 newstate->has_constraint = 1;
1544 err = register_state (dfa, newstate, hash);
1545 if (BE (err != REG_NOERROR, 0))
1547 free_state (newstate);
1553 /* Create the new state which is depend on the context CONTEXT.
1554 Return the new state if succeeded, otherwise return NULL. */
1556 static re_dfastate_t *
1558 create_cd_newstate (re_dfa_t *dfa, const re_node_set *nodes,
1559 unsigned int context, unsigned int hash)
1561 int i, nctx_nodes = 0;
1563 re_dfastate_t *newstate;
1565 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1566 if (BE (newstate == NULL, 0))
1568 err = re_node_set_init_copy (&newstate->nodes, nodes);
1569 if (BE (err != REG_NOERROR, 0))
1575 newstate->context = context;
1576 newstate->entrance_nodes = &newstate->nodes;
1578 for (i = 0 ; i < nodes->nelem ; i++)
1580 unsigned int constraint = 0;
1581 re_token_t *node = dfa->nodes + nodes->elems[i];
1582 re_token_type_t type = node->type;
1583 if (node->constraint)
1584 constraint = node->constraint;
1586 if (type == CHARACTER && !constraint)
1588 #ifdef RE_ENABLE_I18N
1589 newstate->accept_mb |= node->accept_mb;
1590 #endif /* RE_ENABLE_I18N */
1592 /* If the state has the halt node, the state is a halt state. */
1593 if (type == END_OF_RE)
1595 else if (type == OP_BACK_REF)
1596 newstate->has_backref = 1;
1597 else if (type == ANCHOR)
1598 constraint = node->opr.ctx_type;
1602 if (newstate->entrance_nodes == &newstate->nodes)
1604 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1605 if (BE (newstate->entrance_nodes == NULL, 0))
1607 free_state (newstate);
1610 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1612 newstate->has_constraint = 1;
1615 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1617 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1622 err = register_state (dfa, newstate, hash);
1623 if (BE (err != REG_NOERROR, 0))
1625 free_state (newstate);
1633 free_state (re_dfastate_t *state)
1635 re_node_set_free (&state->non_eps_nodes);
1636 re_node_set_free (&state->inveclosure);
1637 if (state->entrance_nodes != &state->nodes)
1639 re_node_set_free (state->entrance_nodes);
1640 re_free (state->entrance_nodes);
1642 re_node_set_free (&state->nodes);
1643 re_free (state->word_trtable);
1644 re_free (state->trtable);