X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Fstr-two-way.h;h=7dcb38721139e18e6ac5c22a8ba6ca958b8bb502;hb=d4d84c1d916f5bff3bb72be92e9d1383a25077c1;hp=28e4265ad7347f7b5e76ee9cc973655f4f7c73aa;hpb=441aa3044f43e5572f58c354f01e6bc070acd5c7;p=gnulib.git diff --git a/lib/str-two-way.h b/lib/str-two-way.h index 28e4265ad..7dcb38721 100644 --- a/lib/str-two-way.h +++ b/lib/str-two-way.h @@ -1,5 +1,5 @@ /* Byte-wise substring search, using the Two-Way algorithm. - Copyright (C) 2008 Free Software Foundation, Inc. + Copyright (C) 2008-2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Written by Eric Blake , 2008. @@ -44,14 +44,15 @@ #include #include -/* We use the Two-Way string matching algorithm, which guarantees - linear complexity with constant space. Additionally, for long - needles, we also use a bad character shift table similar to the - Boyer-Moore algorithm to achieve improved (potentially sub-linear) - performance. +/* We use the Two-Way string matching algorithm (also known as + Chrochemore-Perrin), which guarantees linear complexity with + constant space. Additionally, for long needles, we also use a bad + character shift table similar to the Boyer-Moore algorithm to + achieve improved (potentially sub-linear) performance. - See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 - and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm + See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260, + http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm, + http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf */ /* Point at which computing a bad-byte shift table is likely to be @@ -95,11 +96,14 @@ A critical factorization has the property that the local period equals the global period. All strings have at least one critical factorization with the left half smaller than the global period. + And while some strings have more than one critical factorization, + it is provable that with an ordered alphabet, at least one of the + critical factorizations corresponds to a maximal suffix. Given an ordered alphabet, a critical factorization can be computed in linear time, with 2 * NEEDLE_LEN comparisons, by computing the - larger of two ordered maximal suffixes. The ordered maximal - suffixes are determined by lexicographic comparison of + shorter of two ordered maximal suffixes. The ordered maximal + suffixes are determined by lexicographic comparison while tracking periodicity. */ static size_t critical_factorization (const unsigned char *needle, size_t needle_len, @@ -112,6 +116,14 @@ critical_factorization (const unsigned char *needle, size_t needle_len, size_t p; /* Intermediate period. */ unsigned char a, b; /* Current comparison bytes. */ + /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered + out 0-length needles. */ + if (needle_len < 3) + { + *period = 1; + return needle_len - 1; + } + /* Invariants: 0 <= j < NEEDLE_LEN - 1 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) @@ -190,8 +202,20 @@ critical_factorization (const unsigned char *needle, size_t needle_len, } } - /* Choose the longer suffix. Return the first byte of the right - half, rather than the last byte of the left half. */ + /* Choose the shorter suffix. Return the index of the first byte of + the right half, rather than the last byte of the left half. + + For some examples, 'banana' has two critical factorizations, both + exposed by the two lexicographic extreme suffixes of 'anana' and + 'nana', where both suffixes have a period of 2. On the other + hand, with 'aab' and 'bba', both strings have a single critical + factorization of the last byte, with the suffix having a period + of 1. While the maximal lexicographic suffix of 'aab' is 'b', + the maximal lexicographic suffix of 'bba' is 'ba', which is not a + critical factorization. Conversely, the maximal reverse + lexicographic suffix of 'a' works for 'bba', but not 'ab' for + 'aab'. The shorter suffix of the two will always be a critical + factorization. */ if (max_suffix_rev + 1 < max_suffix + 1) return max_suffix + 1; *period = p; @@ -226,9 +250,9 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, first. */ if (CMP_FUNC (needle, needle + period, suffix) == 0) { - /* Entire needle is periodic; a mismatch can only advance by the - period, so use memory to avoid rescanning known occurrences - of the period. */ + /* Entire needle is periodic; a mismatch in the left half can + only advance by the period, so use memory to avoid rescanning + known occurrences of the period in the right half. */ size_t memory = 0; j = 0; while (AVAILABLE (haystack, haystack_len, j, needle_len)) @@ -330,9 +354,9 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, first. */ if (CMP_FUNC (needle, needle + period, suffix) == 0) { - /* Entire needle is periodic; a mismatch can only advance by the - period, so use memory to avoid rescanning known occurrences - of the period. */ + /* Entire needle is periodic; a mismatch in the left half can + only advance by the period, so use memory to avoid rescanning + known occurrences of the period in the right half. */ size_t memory = 0; size_t shift; j = 0; @@ -349,8 +373,8 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, a byte out of place, there can be no match until after the mismatch. */ shift = needle_len - period; - memory = 0; } + memory = 0; j += shift; continue; }