X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;ds=sidebyside;f=lib%2Fstr-two-way.h;h=dbc2f889fbbefdc47624b976b1b7f49732ec9dc2;hb=55898ee1c7c12e63772c65e772577f10bb7adb31;hp=3f67741bb29077bb8d98fe225cade37e259ac4b4;hpb=b2e2010c7c902235b5efb5bd3c6529f61b093aa4;p=gnulib.git diff --git a/lib/str-two-way.h b/lib/str-two-way.h index 3f67741bb..dbc2f889f 100644 --- a/lib/str-two-way.h +++ b/lib/str-two-way.h @@ -95,26 +95,37 @@ A critical factorization has the property that the local period equals the global period. All strings have at least one critical factorization with the left half smaller than the global period. + And while some strings have more than one critical factorization, + it is provable that with an ordered alphabet, at least one of the + critical factorizations corresponds to a maximal suffix. Given an ordered alphabet, a critical factorization can be computed in linear time, with 2 * NEEDLE_LEN comparisons, by computing the - larger of two ordered maximal suffixes. The ordered maximal - suffixes are determined by lexicographic comparison of + shorter of two ordered maximal suffixes. The ordered maximal + suffixes are determined by lexicographic comparison while tracking periodicity. */ static size_t critical_factorization (const unsigned char *needle, size_t needle_len, size_t *period) { - /* Index of last byte of left half, or SIZE_MAX. */ + /* Index of last byte of left half. */ size_t max_suffix, max_suffix_rev; size_t j; /* Index into NEEDLE for current candidate suffix. */ size_t k; /* Offset into current period. */ size_t p; /* Intermediate period. */ unsigned char a, b; /* Current comparison bytes. */ + /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered + out 0-length needles. */ + if (needle_len < 3) + { + *period = 1; + return needle_len - 1; + } + /* Invariants: - 0 <= j < NEEDLE_LEN - 1 - -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) + 1 <= j < NEEDLE_LEN - 1 + 0 <= max_suffix{,_rev} < j min(max_suffix, max_suffix_rev) < global period of NEEDLE 1 <= p <= global period of NEEDLE p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] @@ -122,9 +133,8 @@ critical_factorization (const unsigned char *needle, size_t needle_len, */ /* Perform lexicographic search. */ - max_suffix = SIZE_MAX; - j = 0; - k = p = 1; + max_suffix = 0; + j = k = p = 1; while (j + k < needle_len) { a = CANON_ELEMENT (needle[j + k]); @@ -157,9 +167,8 @@ critical_factorization (const unsigned char *needle, size_t needle_len, *period = p; /* Perform reverse lexicographic search. */ - max_suffix_rev = SIZE_MAX; - j = 0; - k = p = 1; + max_suffix_rev = 0; + j = k = p = 1; while (j + k < needle_len) { a = CANON_ELEMENT (needle[j + k]); @@ -190,8 +199,20 @@ critical_factorization (const unsigned char *needle, size_t needle_len, } } - /* Choose the longer suffix. Return the first byte of the right - half, rather than the last byte of the left half. */ + /* Choose the shorter suffix. Return the index of the first byte of + the right half, rather than the last byte of the left half. + + For some examples, 'banana' has two critical factorizations, both + exposed by the two lexicographic extreme suffixes of 'anana' and + 'nana', where both suffixes have a period of 2. On the other + hand, with 'aab' and 'bba', both strings have a single critical + factorization of the last byte, with the suffix having a period + of 1. While the maximal lexicographic suffix of 'aab' is 'b', + the maximal lexicographic suffix of 'bba' is 'ba', which is not a + critical factorization. Conversely, the maximal reverse + lexicographic suffix of 'a' works for 'bba', but not 'ab' for + 'aab'. The shorter suffix of the two will always be a critical + factorization. */ if (max_suffix_rev + 1 < max_suffix + 1) return max_suffix + 1; *period = p; @@ -226,9 +247,9 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, first. */ if (CMP_FUNC (needle, needle + period, suffix) == 0) { - /* Entire needle is periodic; a mismatch can only advance by the - period, so use memory to avoid rescanning known occurrences - of the period. */ + /* Entire needle is periodic; a mismatch in the left half can + only advance by the period, so use memory to avoid rescanning + known occurrences of the period in the right half. */ size_t memory = 0; j = 0; while (AVAILABLE (haystack, haystack_len, j, needle_len)) @@ -330,9 +351,9 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, first. */ if (CMP_FUNC (needle, needle + period, suffix) == 0) { - /* Entire needle is periodic; a mismatch can only advance by the - period, so use memory to avoid rescanning known occurrences - of the period. */ + /* Entire needle is periodic; a mismatch in the left half can + only advance by the period, so use memory to avoid rescanning + known occurrences of the period in the right half. */ size_t memory = 0; size_t shift; j = 0;