X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Fstr-two-way.h;h=7dcb38721139e18e6ac5c22a8ba6ca958b8bb502;hb=56d6664559f449af25f0d331457b014b02324d65;hp=b0338a70a1a9e02d80a17d007fcfa4c6ff4dd02d;hpb=a37217922a7b85ddaab2d802b275905f6e9310ac;p=gnulib.git diff --git a/lib/str-two-way.h b/lib/str-two-way.h index b0338a70a..7dcb38721 100644 --- a/lib/str-two-way.h +++ b/lib/str-two-way.h @@ -1,5 +1,5 @@ /* Byte-wise substring search, using the Two-Way algorithm. - Copyright (C) 2008 Free Software Foundation, Inc. + Copyright (C) 2008-2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Written by Eric Blake , 2008. @@ -21,21 +21,21 @@ , and define: RESULT_TYPE A macro that expands to the return type. AVAILABLE(h, h_l, j, n_l) - A macro that returns nonzero if there are - at least N_L bytes left starting at H[J]. - H is 'unsigned char *', H_L, J, and N_L - are 'size_t'; H_L is an lvalue. For - NUL-terminated searches, H_L can be - modified each iteration to avoid having - to compute the end of H up front. + A macro that returns nonzero if there are + at least N_L bytes left starting at H[J]. + H is 'unsigned char *', H_L, J, and N_L + are 'size_t'; H_L is an lvalue. For + NUL-terminated searches, H_L can be + modified each iteration to avoid having + to compute the end of H up front. For case-insensitivity, you may optionally define: CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L - characters of P1 and P2 are equal. + characters of P1 and P2 are equal. CANON_ELEMENT(c) A macro that canonicalizes an element right after - it has been fetched from one of the two strings. - The argument is an 'unsigned char'; the result - must be an 'unsigned char' as well. + it has been fetched from one of the two strings. + The argument is an 'unsigned char'; the result + must be an 'unsigned char' as well. This file undefines the macros documented above, and defines LONG_NEEDLE_THRESHOLD. @@ -44,14 +44,15 @@ #include #include -/* We use the Two-Way string matching algorithm, which guarantees - linear complexity with constant space. Additionally, for long - needles, we also use a bad character shift table similar to the - Boyer-Moore algorithm to achieve improved (potentially sub-linear) - performance. +/* We use the Two-Way string matching algorithm (also known as + Chrochemore-Perrin), which guarantees linear complexity with + constant space. Additionally, for long needles, we also use a bad + character shift table similar to the Boyer-Moore algorithm to + achieve improved (potentially sub-linear) performance. - See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 - and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm + See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260, + http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm, + http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf */ /* Point at which computing a bad-byte shift table is likely to be @@ -95,15 +96,18 @@ A critical factorization has the property that the local period equals the global period. All strings have at least one critical factorization with the left half smaller than the global period. + And while some strings have more than one critical factorization, + it is provable that with an ordered alphabet, at least one of the + critical factorizations corresponds to a maximal suffix. Given an ordered alphabet, a critical factorization can be computed in linear time, with 2 * NEEDLE_LEN comparisons, by computing the - larger of two ordered maximal suffixes. The ordered maximal - suffixes are determined by lexicographic comparison of + shorter of two ordered maximal suffixes. The ordered maximal + suffixes are determined by lexicographic comparison while tracking periodicity. */ static size_t critical_factorization (const unsigned char *needle, size_t needle_len, - size_t *period) + size_t *period) { /* Index of last byte of left half, or SIZE_MAX. */ size_t max_suffix, max_suffix_rev; @@ -112,6 +116,14 @@ critical_factorization (const unsigned char *needle, size_t needle_len, size_t p; /* Intermediate period. */ unsigned char a, b; /* Current comparison bytes. */ + /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered + out 0-length needles. */ + if (needle_len < 3) + { + *period = 1; + return needle_len - 1; + } + /* Invariants: 0 <= j < NEEDLE_LEN - 1 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) @@ -130,29 +142,29 @@ critical_factorization (const unsigned char *needle, size_t needle_len, a = CANON_ELEMENT (needle[j + k]); b = CANON_ELEMENT (needle[max_suffix + k]); if (a < b) - { - /* Suffix is smaller, period is entire prefix so far. */ - j += k; - k = 1; - p = j - max_suffix; - } + { + /* Suffix is smaller, period is entire prefix so far. */ + j += k; + k = 1; + p = j - max_suffix; + } else if (a == b) - { - /* Advance through repetition of the current period. */ - if (k != p) - ++k; - else - { - j += p; - k = 1; - } - } + { + /* Advance through repetition of the current period. */ + if (k != p) + ++k; + else + { + j += p; + k = 1; + } + } else /* b < a */ - { - /* Suffix is larger, start over from current location. */ - max_suffix = j++; - k = p = 1; - } + { + /* Suffix is larger, start over from current location. */ + max_suffix = j++; + k = p = 1; + } } *period = p; @@ -165,33 +177,45 @@ critical_factorization (const unsigned char *needle, size_t needle_len, a = CANON_ELEMENT (needle[j + k]); b = CANON_ELEMENT (needle[max_suffix_rev + k]); if (b < a) - { - /* Suffix is smaller, period is entire prefix so far. */ - j += k; - k = 1; - p = j - max_suffix_rev; - } + { + /* Suffix is smaller, period is entire prefix so far. */ + j += k; + k = 1; + p = j - max_suffix_rev; + } else if (a == b) - { - /* Advance through repetition of the current period. */ - if (k != p) - ++k; - else - { - j += p; - k = 1; - } - } + { + /* Advance through repetition of the current period. */ + if (k != p) + ++k; + else + { + j += p; + k = 1; + } + } else /* a < b */ - { - /* Suffix is larger, start over from current location. */ - max_suffix_rev = j++; - k = p = 1; - } + { + /* Suffix is larger, start over from current location. */ + max_suffix_rev = j++; + k = p = 1; + } } - /* Choose the longer suffix. Return the first byte of the right - half, rather than the last byte of the left half. */ + /* Choose the shorter suffix. Return the index of the first byte of + the right half, rather than the last byte of the left half. + + For some examples, 'banana' has two critical factorizations, both + exposed by the two lexicographic extreme suffixes of 'anana' and + 'nana', where both suffixes have a period of 2. On the other + hand, with 'aab' and 'bba', both strings have a single critical + factorization of the last byte, with the suffix having a period + of 1. While the maximal lexicographic suffix of 'aab' is 'b', + the maximal lexicographic suffix of 'bba' is 'ba', which is not a + critical factorization. Conversely, the maximal reverse + lexicographic suffix of 'a' works for 'bba', but not 'ab' for + 'aab'. The shorter suffix of the two will always be a critical + factorization. */ if (max_suffix_rev + 1 < max_suffix + 1) return max_suffix + 1; *period = p; @@ -210,7 +234,7 @@ critical_factorization (const unsigned char *needle, size_t needle_len, HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ static RETURN_TYPE two_way_short_needle (const unsigned char *haystack, size_t haystack_len, - const unsigned char *needle, size_t needle_len) + const unsigned char *needle, size_t needle_len) { size_t i; /* Index into current byte of NEEDLE. */ size_t j; /* Index into current window of HAYSTACK. */ @@ -226,66 +250,66 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, first. */ if (CMP_FUNC (needle, needle + period, suffix) == 0) { - /* Entire needle is periodic; a mismatch can only advance by the - period, so use memory to avoid rescanning known occurrences - of the period. */ + /* Entire needle is periodic; a mismatch in the left half can + only advance by the period, so use memory to avoid rescanning + known occurrences of the period in the right half. */ size_t memory = 0; j = 0; while (AVAILABLE (haystack, haystack_len, j, needle_len)) - { - /* Scan for matches in right half. */ - i = MAX (suffix, memory); - while (i < needle_len && (CANON_ELEMENT (needle[i]) - == CANON_ELEMENT (haystack[i + j]))) - ++i; - if (needle_len <= i) - { - /* Scan for matches in left half. */ - i = suffix - 1; - while (memory < i + 1 && (CANON_ELEMENT (needle[i]) - == CANON_ELEMENT (haystack[i + j]))) - --i; - if (i + 1 < memory + 1) - return (RETURN_TYPE) (haystack + j); - /* No match, so remember how many repetitions of period - on the right half were scanned. */ - j += period; - memory = needle_len - period; - } - else - { - j += i - suffix + 1; - memory = 0; - } - } + { + /* Scan for matches in right half. */ + i = MAX (suffix, memory); + while (i < needle_len && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + ++i; + if (needle_len <= i) + { + /* Scan for matches in left half. */ + i = suffix - 1; + while (memory < i + 1 && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + --i; + if (i + 1 < memory + 1) + return (RETURN_TYPE) (haystack + j); + /* No match, so remember how many repetitions of period + on the right half were scanned. */ + j += period; + memory = needle_len - period; + } + else + { + j += i - suffix + 1; + memory = 0; + } + } } else { /* The two halves of needle are distinct; no extra memory is - required, and any mismatch results in a maximal shift. */ + required, and any mismatch results in a maximal shift. */ period = MAX (suffix, needle_len - suffix) + 1; j = 0; while (AVAILABLE (haystack, haystack_len, j, needle_len)) - { - /* Scan for matches in right half. */ - i = suffix; - while (i < needle_len && (CANON_ELEMENT (needle[i]) - == CANON_ELEMENT (haystack[i + j]))) - ++i; - if (needle_len <= i) - { - /* Scan for matches in left half. */ - i = suffix - 1; - while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) - == CANON_ELEMENT (haystack[i + j]))) - --i; - if (i == SIZE_MAX) - return (RETURN_TYPE) (haystack + j); - j += period; - } - else - j += i - suffix + 1; - } + { + /* Scan for matches in right half. */ + i = suffix; + while (i < needle_len && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + ++i; + if (needle_len <= i) + { + /* Scan for matches in left half. */ + i = suffix - 1; + while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + --i; + if (i == SIZE_MAX) + return (RETURN_TYPE) (haystack + j); + j += period; + } + else + j += i - suffix + 1; + } } return NULL; } @@ -304,7 +328,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, sublinear performance is not possible. */ static RETURN_TYPE two_way_long_needle (const unsigned char *haystack, size_t haystack_len, - const unsigned char *needle, size_t needle_len) + const unsigned char *needle, size_t needle_len) { size_t i; /* Index into current byte of NEEDLE. */ size_t j; /* Index into current window of HAYSTACK. */ @@ -330,94 +354,94 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, first. */ if (CMP_FUNC (needle, needle + period, suffix) == 0) { - /* Entire needle is periodic; a mismatch can only advance by the - period, so use memory to avoid rescanning known occurrences - of the period. */ + /* Entire needle is periodic; a mismatch in the left half can + only advance by the period, so use memory to avoid rescanning + known occurrences of the period in the right half. */ size_t memory = 0; size_t shift; j = 0; while (AVAILABLE (haystack, haystack_len, j, needle_len)) - { - /* Check the last byte first; if it does not match, then - shift to the next possible match location. */ - shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; - if (0 < shift) - { - if (memory && shift < period) - { - /* Since needle is periodic, but the last period has - a byte out of place, there can be no match until - after the mismatch. */ - shift = needle_len - period; - memory = 0; - } - j += shift; - continue; - } - /* Scan for matches in right half. The last byte has - already been matched, by virtue of the shift table. */ - i = MAX (suffix, memory); - while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) - == CANON_ELEMENT (haystack[i + j]))) - ++i; - if (needle_len - 1 <= i) - { - /* Scan for matches in left half. */ - i = suffix - 1; - while (memory < i + 1 && (CANON_ELEMENT (needle[i]) - == CANON_ELEMENT (haystack[i + j]))) - --i; - if (i + 1 < memory + 1) - return (RETURN_TYPE) (haystack + j); - /* No match, so remember how many repetitions of period - on the right half were scanned. */ - j += period; - memory = needle_len - period; - } - else - { - j += i - suffix + 1; - memory = 0; - } - } + { + /* Check the last byte first; if it does not match, then + shift to the next possible match location. */ + shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; + if (0 < shift) + { + if (memory && shift < period) + { + /* Since needle is periodic, but the last period has + a byte out of place, there can be no match until + after the mismatch. */ + shift = needle_len - period; + } + memory = 0; + j += shift; + continue; + } + /* Scan for matches in right half. The last byte has + already been matched, by virtue of the shift table. */ + i = MAX (suffix, memory); + while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + ++i; + if (needle_len - 1 <= i) + { + /* Scan for matches in left half. */ + i = suffix - 1; + while (memory < i + 1 && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + --i; + if (i + 1 < memory + 1) + return (RETURN_TYPE) (haystack + j); + /* No match, so remember how many repetitions of period + on the right half were scanned. */ + j += period; + memory = needle_len - period; + } + else + { + j += i - suffix + 1; + memory = 0; + } + } } else { /* The two halves of needle are distinct; no extra memory is - required, and any mismatch results in a maximal shift. */ + required, and any mismatch results in a maximal shift. */ size_t shift; period = MAX (suffix, needle_len - suffix) + 1; j = 0; while (AVAILABLE (haystack, haystack_len, j, needle_len)) - { - /* Check the last byte first; if it does not match, then - shift to the next possible match location. */ - shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; - if (0 < shift) - { - j += shift; - continue; - } - /* Scan for matches in right half. The last byte has - already been matched, by virtue of the shift table. */ - i = suffix; - while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) - == CANON_ELEMENT (haystack[i + j]))) - ++i; - if (needle_len - 1 <= i) - { - /* Scan for matches in left half. */ - i = suffix - 1; - while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) - == CANON_ELEMENT (haystack[i + j]))) - --i; - if (i == SIZE_MAX) - return (RETURN_TYPE) (haystack + j); - j += period; - } - else - j += i - suffix + 1; - } + { + /* Check the last byte first; if it does not match, then + shift to the next possible match location. */ + shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; + if (0 < shift) + { + j += shift; + continue; + } + /* Scan for matches in right half. The last byte has + already been matched, by virtue of the shift table. */ + i = suffix; + while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + ++i; + if (needle_len - 1 <= i) + { + /* Scan for matches in left half. */ + i = suffix - 1; + while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + --i; + if (i == SIZE_MAX) + return (RETURN_TYPE) (haystack + j); + j += period; + } + else + j += i - suffix + 1; + } } return NULL; }