1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static void re_string_construct_common (const char *str, int len,
22 RE_TRANSLATE_TYPE trans, int icase,
23 const re_dfa_t *dfa) internal_function;
25 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
26 wint_t *last_wc) internal_function;
27 #endif /* RE_ENABLE_I18N */
28 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
29 unsigned int hash) internal_function;
30 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
31 const re_node_set *nodes,
32 unsigned int hash) internal_function;
33 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
34 const re_node_set *nodes,
36 unsigned int hash) internal_function;
37 static unsigned int inline calc_state_hash (const re_node_set *nodes,
38 unsigned int context) internal_function;
40 /* Functions for string operation. */
42 /* This function allocate the buffers. It is necessary to call
43 re_string_reconstruct before using the object. */
46 re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
49 int len, init_len, icase;
50 RE_TRANSLATE_TYPE trans;
56 /* Ensure at least one character fits into the buffers. */
57 if (init_len < dfa->mb_cur_max)
58 init_len = dfa->mb_cur_max;
59 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
60 re_string_construct_common (str, len, pstr, trans, icase, dfa);
62 ret = re_string_realloc_buffers (pstr, init_buf_len);
63 if (BE (ret != REG_NOERROR, 0))
66 pstr->word_char = dfa->word_char;
67 pstr->word_ops_used = dfa->word_ops_used;
68 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
69 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
70 pstr->valid_raw_len = pstr->valid_len;
74 /* This function allocate the buffers, and initialize them. */
77 re_string_construct (pstr, str, len, trans, icase, dfa)
81 RE_TRANSLATE_TYPE trans;
85 memset (pstr, '\0', sizeof (re_string_t));
86 re_string_construct_common (str, len, pstr, trans, icase, dfa);
90 ret = re_string_realloc_buffers (pstr, len + 1);
91 if (BE (ret != REG_NOERROR, 0))
94 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
99 if (dfa->mb_cur_max > 1)
103 ret = build_wcs_upper_buffer (pstr);
104 if (BE (ret != REG_NOERROR, 0))
106 if (pstr->valid_raw_len >= len)
108 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
110 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
111 if (BE (ret != REG_NOERROR, 0))
116 #endif /* RE_ENABLE_I18N */
117 build_upper_buffer (pstr);
121 #ifdef RE_ENABLE_I18N
122 if (dfa->mb_cur_max > 1)
123 build_wcs_buffer (pstr);
125 #endif /* RE_ENABLE_I18N */
128 re_string_translate_buffer (pstr);
131 pstr->valid_len = pstr->bufs_len;
132 pstr->valid_raw_len = pstr->bufs_len;
140 /* Helper functions for re_string_allocate, and re_string_construct. */
143 re_string_realloc_buffers (pstr, new_buf_len)
147 #ifdef RE_ENABLE_I18N
148 if (pstr->mb_cur_max > 1)
150 wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
151 if (BE (new_array == NULL, 0))
153 pstr->wcs = new_array;
154 if (pstr->offsets != NULL)
156 int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
157 if (BE (new_array == NULL, 0))
159 pstr->offsets = new_array;
162 #endif /* RE_ENABLE_I18N */
163 if (pstr->mbs_allocated)
165 unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
167 if (BE (new_array == NULL, 0))
169 pstr->mbs = new_array;
171 pstr->bufs_len = new_buf_len;
177 re_string_construct_common (str, len, pstr, trans, icase, dfa)
181 RE_TRANSLATE_TYPE trans;
185 pstr->raw_mbs = (const unsigned char *) str;
188 pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
189 pstr->icase = icase ? 1 : 0;
190 pstr->mbs_allocated = (trans != NULL || icase);
191 pstr->mb_cur_max = dfa->mb_cur_max;
192 pstr->is_utf8 = dfa->is_utf8;
193 pstr->map_notascii = dfa->map_notascii;
194 pstr->stop = pstr->len;
195 pstr->raw_stop = pstr->stop;
198 #ifdef RE_ENABLE_I18N
200 /* Build wide character buffer PSTR->WCS.
201 If the byte sequence of the string are:
202 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
203 Then wide character buffer will be:
204 <wc1> , WEOF , <wc2> , WEOF , <wc3>
205 We use WEOF for padding, they indicate that the position isn't
206 a first byte of a multibyte character.
208 Note that this function assumes PSTR->VALID_LEN elements are already
209 built and starts from PSTR->VALID_LEN. */
212 build_wcs_buffer (pstr)
216 unsigned char buf[MB_LEN_MAX];
217 assert (MB_LEN_MAX >= pstr->mb_cur_max);
219 unsigned char buf[64];
222 int byte_idx, end_idx, remain_len;
225 /* Build the buffers from pstr->valid_len to either pstr->len or
227 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
228 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
233 remain_len = end_idx - byte_idx;
234 prev_st = pstr->cur_state;
235 /* Apply the translation if we need. */
236 if (BE (pstr->trans != NULL, 0))
240 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
242 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
243 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
245 p = (const char *) buf;
248 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
249 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
250 if (BE (mbclen == (size_t) -2, 0))
252 /* The buffer doesn't have enough space, finish to build. */
253 pstr->cur_state = prev_st;
256 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
258 /* We treat these cases as a singlebyte character. */
260 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
261 if (BE (pstr->trans != NULL, 0))
262 wc = pstr->trans[wc];
263 pstr->cur_state = prev_st;
266 /* Write wide character and padding. */
267 pstr->wcs[byte_idx++] = wc;
268 /* Write paddings. */
269 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
270 pstr->wcs[byte_idx++] = WEOF;
272 pstr->valid_len = byte_idx;
273 pstr->valid_raw_len = byte_idx;
276 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
277 but for REG_ICASE. */
280 build_wcs_upper_buffer (pstr)
284 int src_idx, byte_idx, end_idx, remain_len;
287 char buf[MB_LEN_MAX];
288 assert (MB_LEN_MAX >= pstr->mb_cur_max);
293 byte_idx = pstr->valid_len;
294 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
296 /* The following optimization assumes that ASCII characters can be
297 mapped to wide characters with a simple cast. */
298 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
300 while (byte_idx < end_idx)
304 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
305 && mbsinit (&pstr->cur_state))
307 /* In case of a singlebyte character. */
309 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
310 /* The next step uses the assumption that wchar_t is encoded
311 ASCII-safe: all ASCII values can be converted like this. */
312 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
317 remain_len = end_idx - byte_idx;
318 prev_st = pstr->cur_state;
319 mbclen = mbrtowc (&wc,
320 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
321 + byte_idx), remain_len, &pstr->cur_state);
322 if (BE (mbclen + 2 > 2, 1))
330 mbcdlen = wcrtomb (buf, wcu, &prev_st);
331 if (BE (mbclen == mbcdlen, 1))
332 memcpy (pstr->mbs + byte_idx, buf, mbclen);
340 memcpy (pstr->mbs + byte_idx,
341 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
342 pstr->wcs[byte_idx++] = wcu;
343 /* Write paddings. */
344 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
345 pstr->wcs[byte_idx++] = WEOF;
347 else if (mbclen == (size_t) -1 || mbclen == 0)
349 /* It is an invalid character or '\0'. Just use the byte. */
350 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
351 pstr->mbs[byte_idx] = ch;
352 /* And also cast it to wide char. */
353 pstr->wcs[byte_idx++] = (wchar_t) ch;
354 if (BE (mbclen == (size_t) -1, 0))
355 pstr->cur_state = prev_st;
359 /* The buffer doesn't have enough space, finish to build. */
360 pstr->cur_state = prev_st;
364 pstr->valid_len = byte_idx;
365 pstr->valid_raw_len = byte_idx;
369 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
374 remain_len = end_idx - byte_idx;
375 prev_st = pstr->cur_state;
376 if (BE (pstr->trans != NULL, 0))
380 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
382 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
383 buf[i] = pstr->trans[ch];
385 p = (const char *) buf;
388 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
389 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
390 if (BE (mbclen + 2 > 2, 1))
398 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
399 if (BE (mbclen == mbcdlen, 1))
400 memcpy (pstr->mbs + byte_idx, buf, mbclen);
401 else if (mbcdlen != (size_t) -1)
405 if (byte_idx + mbcdlen > pstr->bufs_len)
407 pstr->cur_state = prev_st;
411 if (pstr->offsets == NULL)
413 pstr->offsets = re_malloc (int, pstr->bufs_len);
415 if (pstr->offsets == NULL)
418 if (!pstr->offsets_needed)
420 for (i = 0; i < (size_t) byte_idx; ++i)
421 pstr->offsets[i] = i;
422 pstr->offsets_needed = 1;
425 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
426 pstr->wcs[byte_idx] = wcu;
427 pstr->offsets[byte_idx] = src_idx;
428 for (i = 1; i < mbcdlen; ++i)
430 pstr->offsets[byte_idx + i]
431 = src_idx + (i < mbclen ? i : mbclen - 1);
432 pstr->wcs[byte_idx + i] = WEOF;
434 pstr->len += mbcdlen - mbclen;
435 if (pstr->raw_stop > src_idx)
436 pstr->stop += mbcdlen - mbclen;
437 end_idx = (pstr->bufs_len > pstr->len)
438 ? pstr->len : pstr->bufs_len;
444 memcpy (pstr->mbs + byte_idx, p, mbclen);
447 memcpy (pstr->mbs + byte_idx, p, mbclen);
449 if (BE (pstr->offsets_needed != 0, 0))
452 for (i = 0; i < mbclen; ++i)
453 pstr->offsets[byte_idx + i] = src_idx + i;
457 pstr->wcs[byte_idx++] = wcu;
458 /* Write paddings. */
459 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
460 pstr->wcs[byte_idx++] = WEOF;
462 else if (mbclen == (size_t) -1 || mbclen == 0)
464 /* It is an invalid character or '\0'. Just use the byte. */
465 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
467 if (BE (pstr->trans != NULL, 0))
468 ch = pstr->trans [ch];
469 pstr->mbs[byte_idx] = ch;
471 if (BE (pstr->offsets_needed != 0, 0))
472 pstr->offsets[byte_idx] = src_idx;
475 /* And also cast it to wide char. */
476 pstr->wcs[byte_idx++] = (wchar_t) ch;
477 if (BE (mbclen == (size_t) -1, 0))
478 pstr->cur_state = prev_st;
482 /* The buffer doesn't have enough space, finish to build. */
483 pstr->cur_state = prev_st;
487 pstr->valid_len = byte_idx;
488 pstr->valid_raw_len = src_idx;
492 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
496 re_string_skip_chars (pstr, new_raw_idx, last_wc)
506 /* Skip the characters which are not necessary to check. */
507 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
508 rawbuf_idx < new_raw_idx;)
511 remain_len = pstr->len - rawbuf_idx;
512 prev_st = pstr->cur_state;
513 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
514 remain_len, &pstr->cur_state);
515 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
517 /* We treat these cases as a singlebyte character. */
519 pstr->cur_state = prev_st;
521 /* Then proceed the next character. */
522 rawbuf_idx += mbclen;
524 *last_wc = (wint_t) wc;
527 #endif /* RE_ENABLE_I18N */
529 /* Build the buffer PSTR->MBS, and apply the translation if we need.
530 This function is used in case of REG_ICASE. */
533 build_upper_buffer (pstr)
536 int char_idx, end_idx;
537 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
539 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
541 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
542 if (BE (pstr->trans != NULL, 0))
543 ch = pstr->trans[ch];
545 pstr->mbs[char_idx] = toupper (ch);
547 pstr->mbs[char_idx] = ch;
549 pstr->valid_len = char_idx;
550 pstr->valid_raw_len = char_idx;
553 /* Apply TRANS to the buffer in PSTR. */
556 re_string_translate_buffer (pstr)
559 int buf_idx, end_idx;
560 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
562 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
564 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
565 pstr->mbs[buf_idx] = pstr->trans[ch];
568 pstr->valid_len = buf_idx;
569 pstr->valid_raw_len = buf_idx;
572 /* This function re-construct the buffers.
573 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
574 convert to upper case in case of REG_ICASE, apply translation. */
577 re_string_reconstruct (pstr, idx, eflags)
581 int offset = idx - pstr->raw_mbs_idx;
582 if (BE (offset < 0, 0))
585 #ifdef RE_ENABLE_I18N
586 if (pstr->mb_cur_max > 1)
587 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
588 #endif /* RE_ENABLE_I18N */
589 pstr->len = pstr->raw_len;
590 pstr->stop = pstr->raw_stop;
592 pstr->raw_mbs_idx = 0;
593 pstr->valid_raw_len = 0;
594 pstr->offsets_needed = 0;
595 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
596 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
597 if (!pstr->mbs_allocated)
598 pstr->mbs = (unsigned char *) pstr->raw_mbs;
602 if (BE (offset != 0, 1))
604 /* Are the characters which are already checked remain? */
605 if (BE (offset < pstr->valid_raw_len, 1)
606 #ifdef RE_ENABLE_I18N
607 /* Handling this would enlarge the code too much.
608 Accept a slowdown in that case. */
609 && pstr->offsets_needed == 0
613 /* Yes, move them to the front of the buffer. */
614 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
615 #ifdef RE_ENABLE_I18N
616 if (pstr->mb_cur_max > 1)
617 memmove (pstr->wcs, pstr->wcs + offset,
618 (pstr->valid_len - offset) * sizeof (wint_t));
619 #endif /* RE_ENABLE_I18N */
620 if (BE (pstr->mbs_allocated, 0))
621 memmove (pstr->mbs, pstr->mbs + offset,
622 pstr->valid_len - offset);
623 pstr->valid_len -= offset;
624 pstr->valid_raw_len -= offset;
626 assert (pstr->valid_len > 0);
631 /* No, skip all characters until IDX. */
632 #ifdef RE_ENABLE_I18N
633 if (BE (pstr->offsets_needed, 0))
635 pstr->len = pstr->raw_len - idx + offset;
636 pstr->stop = pstr->raw_stop - idx + offset;
637 pstr->offsets_needed = 0;
641 pstr->valid_raw_len = 0;
642 #ifdef RE_ENABLE_I18N
643 if (pstr->mb_cur_max > 1)
650 const unsigned char *raw, *p, *q, *end;
652 /* Special case UTF-8. Multi-byte chars start with any
653 byte other than 0x80 - 0xbf. */
654 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
655 end = raw + (offset - pstr->mb_cur_max);
656 for (p = raw + offset - 1; p >= end; --p)
657 if ((*p & 0xc0) != 0x80)
661 int mlen = raw + pstr->len - p;
662 unsigned char buf[6];
665 if (BE (pstr->trans != NULL, 0))
667 int i = mlen < 6 ? mlen : 6;
669 buf[i] = pstr->trans[p[i]];
672 /* XXX Don't use mbrtowc, we know which conversion
673 to use (UTF-8 -> UCS4). */
674 memset (&cur_state, 0, sizeof (cur_state));
675 mlen = (mbrtowc (&wc2, (const char *) p, mlen,
677 - (raw + offset - p));
680 memset (&pstr->cur_state, '\0',
682 pstr->valid_len = mlen;
690 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
691 if (BE (pstr->valid_len, 0))
693 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
694 pstr->wcs[wcs_idx] = WEOF;
695 if (pstr->mbs_allocated)
696 memset (pstr->mbs, 255, pstr->valid_len);
698 pstr->valid_raw_len = pstr->valid_len;
699 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
700 && IS_WIDE_WORD_CHAR (wc))
702 : ((IS_WIDE_NEWLINE (wc)
703 && pstr->newline_anchor)
704 ? CONTEXT_NEWLINE : 0));
707 #endif /* RE_ENABLE_I18N */
709 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
712 pstr->tip_context = (bitset_contain (pstr->word_char, c)
714 : ((IS_NEWLINE (c) && pstr->newline_anchor)
715 ? CONTEXT_NEWLINE : 0));
718 if (!BE (pstr->mbs_allocated, 0))
721 pstr->raw_mbs_idx = idx;
723 pstr->stop -= offset;
725 /* Then build the buffers. */
726 #ifdef RE_ENABLE_I18N
727 if (pstr->mb_cur_max > 1)
731 int ret = build_wcs_upper_buffer (pstr);
732 if (BE (ret != REG_NOERROR, 0))
736 build_wcs_buffer (pstr);
739 #endif /* RE_ENABLE_I18N */
740 if (BE (pstr->mbs_allocated, 0))
743 build_upper_buffer (pstr);
744 else if (pstr->trans != NULL)
745 re_string_translate_buffer (pstr);
748 pstr->valid_len = pstr->len;
755 re_string_peek_byte_case (pstr, idx)
756 const re_string_t *pstr;
761 /* Handle the common (easiest) cases first. */
762 if (BE (!pstr->mbs_allocated, 1))
763 return re_string_peek_byte (pstr, idx);
765 #ifdef RE_ENABLE_I18N
766 if (pstr->mb_cur_max > 1
767 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
768 return re_string_peek_byte (pstr, idx);
771 off = pstr->cur_idx + idx;
772 #ifdef RE_ENABLE_I18N
773 if (pstr->offsets_needed)
774 off = pstr->offsets[off];
777 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
779 #ifdef RE_ENABLE_I18N
780 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
781 this function returns CAPITAL LETTER I instead of first byte of
782 DOTLESS SMALL LETTER I. The latter would confuse the parser,
783 since peek_byte_case doesn't advance cur_idx in any way. */
784 if (pstr->offsets_needed && !isascii (ch))
785 return re_string_peek_byte (pstr, idx);
792 re_string_fetch_byte_case (pstr)
795 if (BE (!pstr->mbs_allocated, 1))
796 return re_string_fetch_byte (pstr);
798 #ifdef RE_ENABLE_I18N
799 if (pstr->offsets_needed)
803 /* For tr_TR.UTF-8 [[:islower:]] there is
804 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
805 in that case the whole multi-byte character and return
806 the original letter. On the other side, with
807 [[: DOTLESS SMALL LETTER I return [[:I, as doing
808 anything else would complicate things too much. */
810 if (!re_string_first_byte (pstr, pstr->cur_idx))
811 return re_string_fetch_byte (pstr);
813 off = pstr->offsets[pstr->cur_idx];
814 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
817 return re_string_fetch_byte (pstr);
819 re_string_skip_bytes (pstr,
820 re_string_char_size_at (pstr, pstr->cur_idx));
825 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
829 re_string_destruct (pstr)
832 #ifdef RE_ENABLE_I18N
834 re_free (pstr->offsets);
835 #endif /* RE_ENABLE_I18N */
836 if (pstr->mbs_allocated)
840 /* Return the context at IDX in INPUT. */
843 re_string_context_at (input, idx, eflags)
844 const re_string_t *input;
849 /* In this case, we use the value stored in input->tip_context,
850 since we can't know the character in input->mbs[-1] here. */
851 return input->tip_context;
852 if (BE (idx == input->len, 0))
853 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
854 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
855 #ifdef RE_ENABLE_I18N
856 if (input->mb_cur_max > 1)
860 while(input->wcs[wc_idx] == WEOF)
863 /* It must not happen. */
864 assert (wc_idx >= 0);
868 return input->tip_context;
870 wc = input->wcs[wc_idx];
871 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
873 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
874 ? CONTEXT_NEWLINE : 0);
879 c = re_string_byte_at (input, idx);
880 if (bitset_contain (input->word_char, c))
882 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
886 /* Functions for set operation. */
889 re_node_set_alloc (set, size)
895 set->elems = re_malloc (int, size);
896 if (BE (set->elems == NULL, 0))
902 re_node_set_init_1 (set, elem)
908 set->elems = re_malloc (int, 1);
909 if (BE (set->elems == NULL, 0))
911 set->alloc = set->nelem = 0;
914 set->elems[0] = elem;
919 re_node_set_init_2 (set, elem1, elem2)
924 set->elems = re_malloc (int, 2);
925 if (BE (set->elems == NULL, 0))
930 set->elems[0] = elem1;
937 set->elems[0] = elem1;
938 set->elems[1] = elem2;
942 set->elems[0] = elem2;
943 set->elems[1] = elem1;
950 re_node_set_init_copy (dest, src)
952 const re_node_set *src;
954 dest->nelem = src->nelem;
957 dest->alloc = dest->nelem;
958 dest->elems = re_malloc (int, dest->alloc);
959 if (BE (dest->elems == NULL, 0))
961 dest->alloc = dest->nelem = 0;
964 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
967 re_node_set_init_empty (dest);
971 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
972 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
973 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
976 re_node_set_add_intersect (dest, src1, src2)
978 const re_node_set *src1, *src2;
980 int i1, i2, is, id, delta, sbase;
981 if (src1->nelem == 0 || src2->nelem == 0)
984 /* We need dest->nelem + 2 * elems_in_intersection; this is a
985 conservative estimate. */
986 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
988 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
989 int *new_elems = re_realloc (dest->elems, int, new_alloc);
990 if (BE (new_elems == NULL, 0))
992 dest->elems = new_elems;
993 dest->alloc = new_alloc;
996 /* Find the items in the intersection of SRC1 and SRC2, and copy
997 into the top of DEST those that are not already in DEST itself. */
998 sbase = dest->nelem + src1->nelem + src2->nelem;
999 i1 = src1->nelem - 1;
1000 i2 = src2->nelem - 1;
1001 id = dest->nelem - 1;
1004 if (src1->elems[i1] == src2->elems[i2])
1006 /* Try to find the item in DEST. Maybe we could binary search? */
1007 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1010 if (id < 0 || dest->elems[id] != src1->elems[i1])
1011 dest->elems[--sbase] = src1->elems[i1];
1013 if (--i1 < 0 || --i2 < 0)
1017 /* Lower the highest of the two items. */
1018 else if (src1->elems[i1] < src2->elems[i2])
1030 id = dest->nelem - 1;
1031 is = dest->nelem + src1->nelem + src2->nelem - 1;
1032 delta = is - sbase + 1;
1034 /* Now copy. When DELTA becomes zero, the remaining
1035 DEST elements are already in place; this is more or
1036 less the same loop that is in re_node_set_merge. */
1037 dest->nelem += delta;
1038 if (delta > 0 && id >= 0)
1041 if (dest->elems[is] > dest->elems[id])
1043 /* Copy from the top. */
1044 dest->elems[id + delta--] = dest->elems[is--];
1050 /* Slide from the bottom. */
1051 dest->elems[id + delta] = dest->elems[id];
1057 /* Copy remaining SRC elements. */
1058 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1063 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1064 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1066 static reg_errcode_t
1067 re_node_set_init_union (dest, src1, src2)
1069 const re_node_set *src1, *src2;
1072 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1074 dest->alloc = src1->nelem + src2->nelem;
1075 dest->elems = re_malloc (int, dest->alloc);
1076 if (BE (dest->elems == NULL, 0))
1081 if (src1 != NULL && src1->nelem > 0)
1082 return re_node_set_init_copy (dest, src1);
1083 else if (src2 != NULL && src2->nelem > 0)
1084 return re_node_set_init_copy (dest, src2);
1086 re_node_set_init_empty (dest);
1089 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1091 if (src1->elems[i1] > src2->elems[i2])
1093 dest->elems[id++] = src2->elems[i2++];
1096 if (src1->elems[i1] == src2->elems[i2])
1098 dest->elems[id++] = src1->elems[i1++];
1100 if (i1 < src1->nelem)
1102 memcpy (dest->elems + id, src1->elems + i1,
1103 (src1->nelem - i1) * sizeof (int));
1104 id += src1->nelem - i1;
1106 else if (i2 < src2->nelem)
1108 memcpy (dest->elems + id, src2->elems + i2,
1109 (src2->nelem - i2) * sizeof (int));
1110 id += src2->nelem - i2;
1116 /* Calculate the union set of the sets DEST and SRC. And store it to
1117 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1119 static reg_errcode_t
1120 re_node_set_merge (dest, src)
1122 const re_node_set *src;
1124 int is, id, sbase, delta;
1125 if (src == NULL || src->nelem == 0)
1127 if (dest->alloc < 2 * src->nelem + dest->nelem)
1129 int new_alloc = 2 * (src->nelem + dest->alloc);
1130 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1131 if (BE (new_buffer == NULL, 0))
1133 dest->elems = new_buffer;
1134 dest->alloc = new_alloc;
1137 if (BE (dest->nelem == 0, 0))
1139 dest->nelem = src->nelem;
1140 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1144 /* Copy into the top of DEST the items of SRC that are not
1145 found in DEST. Maybe we could binary search in DEST? */
1146 for (sbase = dest->nelem + 2 * src->nelem,
1147 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1149 if (dest->elems[id] == src->elems[is])
1151 else if (dest->elems[id] < src->elems[is])
1152 dest->elems[--sbase] = src->elems[is--];
1153 else /* if (dest->elems[id] > src->elems[is]) */
1159 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1161 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1164 id = dest->nelem - 1;
1165 is = dest->nelem + 2 * src->nelem - 1;
1166 delta = is - sbase + 1;
1170 /* Now copy. When DELTA becomes zero, the remaining
1171 DEST elements are already in place. */
1172 dest->nelem += delta;
1175 if (dest->elems[is] > dest->elems[id])
1177 /* Copy from the top. */
1178 dest->elems[id + delta--] = dest->elems[is--];
1184 /* Slide from the bottom. */
1185 dest->elems[id + delta] = dest->elems[id];
1188 /* Copy remaining SRC elements. */
1189 memcpy (dest->elems, dest->elems + sbase,
1190 delta * sizeof (int));
1199 /* Insert the new element ELEM to the re_node_set* SET.
1200 SET should not already have ELEM.
1201 return -1 if an error is occured, return 1 otherwise. */
1204 re_node_set_insert (set, elem)
1209 /* In case the set is empty. */
1210 if (set->alloc == 0)
1212 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1218 if (BE (set->nelem, 0) == 0)
1220 /* We already guaranteed above that set->alloc != 0. */
1221 set->elems[0] = elem;
1226 /* Realloc if we need. */
1227 if (set->alloc == set->nelem)
1230 set->alloc = set->alloc * 2;
1231 new_array = re_realloc (set->elems, int, set->alloc);
1232 if (BE (new_array == NULL, 0))
1234 set->elems = new_array;
1237 /* Move the elements which follows the new element. Test the
1238 first element separately to skip a check in the inner loop. */
1239 if (elem < set->elems[0])
1242 for (idx = set->nelem; idx > 0; idx--)
1243 set->elems[idx] = set->elems[idx - 1];
1247 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1248 set->elems[idx] = set->elems[idx - 1];
1251 /* Insert the new element. */
1252 set->elems[idx] = elem;
1257 /* Insert the new element ELEM to the re_node_set* SET.
1258 SET should not already have any element greater than or equal to ELEM.
1259 Return -1 if an error is occured, return 1 otherwise. */
1262 re_node_set_insert_last (set, elem)
1266 /* Realloc if we need. */
1267 if (set->alloc == set->nelem)
1270 set->alloc = (set->alloc + 1) * 2;
1271 new_array = re_realloc (set->elems, int, set->alloc);
1272 if (BE (new_array == NULL, 0))
1274 set->elems = new_array;
1277 /* Insert the new element. */
1278 set->elems[set->nelem++] = elem;
1282 /* Compare two node sets SET1 and SET2.
1283 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1286 re_node_set_compare (set1, set2)
1287 const re_node_set *set1, *set2;
1290 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1292 for (i = set1->nelem ; --i >= 0 ; )
1293 if (set1->elems[i] != set2->elems[i])
1298 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1301 re_node_set_contains (set, elem)
1302 const re_node_set *set;
1305 unsigned int idx, right, mid;
1306 if (set->nelem <= 0)
1309 /* Binary search the element. */
1311 right = set->nelem - 1;
1314 mid = (idx + right) / 2;
1315 if (set->elems[mid] < elem)
1320 return set->elems[idx] == elem ? idx + 1 : 0;
1324 re_node_set_remove_at (set, idx)
1328 if (idx < 0 || idx >= set->nelem)
1331 for (; idx < set->nelem; idx++)
1332 set->elems[idx] = set->elems[idx + 1];
1336 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1337 Or return -1, if an error will be occured. */
1340 re_dfa_add_node (dfa, token)
1344 int type = token.type;
1345 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1347 int new_nodes_alloc = dfa->nodes_alloc * 2;
1348 int *new_nexts, *new_indices;
1349 re_node_set *new_edests, *new_eclosures;
1351 re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1353 if (BE (new_array == NULL, 0))
1355 dfa->nodes = new_array;
1356 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1357 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1358 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1359 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1360 if (BE (new_nexts == NULL || new_indices == NULL
1361 || new_edests == NULL || new_eclosures == NULL, 0))
1363 dfa->nexts = new_nexts;
1364 dfa->org_indices = new_indices;
1365 dfa->edests = new_edests;
1366 dfa->eclosures = new_eclosures;
1367 dfa->nodes_alloc = new_nodes_alloc;
1369 dfa->nodes[dfa->nodes_len] = token;
1370 dfa->nodes[dfa->nodes_len].constraint = 0;
1371 #ifdef RE_ENABLE_I18N
1372 dfa->nodes[dfa->nodes_len].accept_mb =
1373 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1375 dfa->nexts[dfa->nodes_len] = -1;
1376 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1377 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1378 return dfa->nodes_len++;
1381 static unsigned int inline
1382 calc_state_hash (nodes, context)
1383 const re_node_set *nodes;
1384 unsigned int context;
1386 unsigned int hash = nodes->nelem + context;
1388 for (i = 0 ; i < nodes->nelem ; i++)
1389 hash += nodes->elems[i];
1393 /* Search for the state whose node_set is equivalent to NODES.
1394 Return the pointer to the state, if we found it in the DFA.
1395 Otherwise create the new one and return it. In case of an error
1396 return NULL and set the error code in ERR.
1397 Note: - We assume NULL as the invalid state, then it is possible that
1398 return value is NULL and ERR is REG_NOERROR.
1399 - We never return non-NULL value in case of any errors, it is for
1402 static re_dfastate_t*
1403 re_acquire_state (err, dfa, nodes)
1406 const re_node_set *nodes;
1409 re_dfastate_t *new_state;
1410 struct re_state_table_entry *spot;
1412 if (BE (nodes->nelem == 0, 0))
1417 hash = calc_state_hash (nodes, 0);
1418 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1420 for (i = 0 ; i < spot->num ; i++)
1422 re_dfastate_t *state = spot->array[i];
1423 if (hash != state->hash)
1425 if (re_node_set_compare (&state->nodes, nodes))
1429 /* There are no appropriate state in the dfa, create the new one. */
1430 new_state = create_ci_newstate (dfa, nodes, hash);
1431 if (BE (new_state != NULL, 1))
1440 /* Search for the state whose node_set is equivalent to NODES and
1441 whose context is equivalent to CONTEXT.
1442 Return the pointer to the state, if we found it in the DFA.
1443 Otherwise create the new one and return it. In case of an error
1444 return NULL and set the error code in ERR.
1445 Note: - We assume NULL as the invalid state, then it is possible that
1446 return value is NULL and ERR is REG_NOERROR.
1447 - We never return non-NULL value in case of any errors, it is for
1450 static re_dfastate_t*
1451 re_acquire_state_context (err, dfa, nodes, context)
1454 const re_node_set *nodes;
1455 unsigned int context;
1458 re_dfastate_t *new_state;
1459 struct re_state_table_entry *spot;
1461 if (nodes->nelem == 0)
1466 hash = calc_state_hash (nodes, context);
1467 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1469 for (i = 0 ; i < spot->num ; i++)
1471 re_dfastate_t *state = spot->array[i];
1472 if (state->hash == hash
1473 && state->context == context
1474 && re_node_set_compare (state->entrance_nodes, nodes))
1477 /* There are no appropriate state in `dfa', create the new one. */
1478 new_state = create_cd_newstate (dfa, nodes, context, hash);
1479 if (BE (new_state != NULL, 1))
1488 /* Finish initialization of the new state NEWSTATE, and using its hash value
1489 HASH put in the appropriate bucket of DFA's state table. Return value
1490 indicates the error code if failed. */
1492 static reg_errcode_t
1493 register_state (dfa, newstate, hash)
1495 re_dfastate_t *newstate;
1498 struct re_state_table_entry *spot;
1502 newstate->hash = hash;
1503 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1504 if (BE (err != REG_NOERROR, 0))
1506 for (i = 0; i < newstate->nodes.nelem; i++)
1508 int elem = newstate->nodes.elems[i];
1509 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1510 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1513 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1514 if (BE (spot->alloc <= spot->num, 0))
1516 int new_alloc = 2 * spot->num + 2;
1517 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1519 if (BE (new_array == NULL, 0))
1521 spot->array = new_array;
1522 spot->alloc = new_alloc;
1524 spot->array[spot->num++] = newstate;
1528 /* Create the new state which is independ of contexts.
1529 Return the new state if succeeded, otherwise return NULL. */
1531 static re_dfastate_t *
1532 create_ci_newstate (dfa, nodes, hash)
1534 const re_node_set *nodes;
1539 re_dfastate_t *newstate;
1541 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1542 if (BE (newstate == NULL, 0))
1544 err = re_node_set_init_copy (&newstate->nodes, nodes);
1545 if (BE (err != REG_NOERROR, 0))
1551 newstate->entrance_nodes = &newstate->nodes;
1552 for (i = 0 ; i < nodes->nelem ; i++)
1554 re_token_t *node = dfa->nodes + nodes->elems[i];
1555 re_token_type_t type = node->type;
1556 if (type == CHARACTER && !node->constraint)
1558 #ifdef RE_ENABLE_I18N
1559 newstate->accept_mb |= node->accept_mb;
1560 #endif /* RE_ENABLE_I18N */
1562 /* If the state has the halt node, the state is a halt state. */
1563 if (type == END_OF_RE)
1565 else if (type == OP_BACK_REF)
1566 newstate->has_backref = 1;
1567 else if (type == ANCHOR || node->constraint)
1568 newstate->has_constraint = 1;
1570 err = register_state (dfa, newstate, hash);
1571 if (BE (err != REG_NOERROR, 0))
1573 free_state (newstate);
1579 /* Create the new state which is depend on the context CONTEXT.
1580 Return the new state if succeeded, otherwise return NULL. */
1582 static re_dfastate_t *
1583 create_cd_newstate (dfa, nodes, context, hash)
1585 const re_node_set *nodes;
1586 unsigned int context, hash;
1588 int i, nctx_nodes = 0;
1590 re_dfastate_t *newstate;
1592 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1593 if (BE (newstate == NULL, 0))
1595 err = re_node_set_init_copy (&newstate->nodes, nodes);
1596 if (BE (err != REG_NOERROR, 0))
1602 newstate->context = context;
1603 newstate->entrance_nodes = &newstate->nodes;
1605 for (i = 0 ; i < nodes->nelem ; i++)
1607 unsigned int constraint = 0;
1608 re_token_t *node = dfa->nodes + nodes->elems[i];
1609 re_token_type_t type = node->type;
1610 if (node->constraint)
1611 constraint = node->constraint;
1613 if (type == CHARACTER && !constraint)
1615 #ifdef RE_ENABLE_I18N
1616 newstate->accept_mb |= node->accept_mb;
1617 #endif /* RE_ENABLE_I18N */
1619 /* If the state has the halt node, the state is a halt state. */
1620 if (type == END_OF_RE)
1622 else if (type == OP_BACK_REF)
1623 newstate->has_backref = 1;
1624 else if (type == ANCHOR)
1625 constraint = node->opr.ctx_type;
1629 if (newstate->entrance_nodes == &newstate->nodes)
1631 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1632 if (BE (newstate->entrance_nodes == NULL, 0))
1634 free_state (newstate);
1637 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1639 newstate->has_constraint = 1;
1642 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1644 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1649 err = register_state (dfa, newstate, hash);
1650 if (BE (err != REG_NOERROR, 0))
1652 free_state (newstate);
1660 re_dfastate_t *state;
1662 re_node_set_free (&state->non_eps_nodes);
1663 re_node_set_free (&state->inveclosure);
1664 if (state->entrance_nodes != &state->nodes)
1666 re_node_set_free (state->entrance_nodes);
1667 re_free (state->entrance_nodes);
1669 re_node_set_free (&state->nodes);
1670 re_free (state->word_trtable);
1671 re_free (state->trtable);