From 7855e1e3a12ea5e158d51b9ab7f38006ac5fc028 Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Mon, 31 Dec 2007 11:57:16 +0100 Subject: [PATCH] Unify 5 copies of the KMP code. --- ChangeLog | 24 +++++++++ lib/c-strcasestr.c | 126 ++----------------------------------------- lib/c-strstr.c | 125 ++----------------------------------------- lib/mbscasestr.c | 126 ++----------------------------------------- lib/mbsstr.c | 125 ++----------------------------------------- lib/str-kmp.h | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++ lib/strcasestr.c | 126 ++----------------------------------------- modules/c-strcasestr | 1 + modules/c-strstr | 1 + modules/mbscasestr | 1 + modules/mbsstr | 1 + modules/strcasestr | 1 + 12 files changed, 199 insertions(+), 606 deletions(-) create mode 100644 lib/str-kmp.h diff --git a/ChangeLog b/ChangeLog index 22b0d540e..61bafea26 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,29 @@ 2007-12-30 Bruno Haible + Unify 5 copies of the KMP code. + * lib/str-kmp.h: New file. + * lib/c-strcasestr.c: Include str-kmp.h. + (knuth_morris_pratt): Remove function. + (c_strcasestr): Update. + * lib/c-strstr.c: Include str-kmp.h. + (knuth_morris_pratt): Remove function. + (c_strcasestr): Update. + * lib/mbscasestr.c: Include str-kmp.h. + (knuth_morris_pratt_unibyte): Remove function. + * lib/mbsstr.c: Include str-kmp.h. + (knuth_morris_pratt_unibyte): Remove function. + * lib/strcasestr.c: Include str-kmp.h. + (knuth_morris_pratt): Remove function. + (strcasestr): Update. + * modules/c-strcasestr (Files): Add lib/str-kmp.h. + * modules/c-strstr (Files): Likewise. + * modules/mbscasestr (Files): Likewise. + * modules/mbsstr (Files): Likewise. + * modules/strcasestr (Files): Likewise. + Suggested by Paul Eggert. + +2007-12-30 Bruno Haible + * lib/xmalloca.c (xmmalloca): Don't define if HAVE_ALLOCA is not defined. diff --git a/lib/c-strcasestr.c b/lib/c-strcasestr.c index 36b2a9f5c..428be7001 100644 --- a/lib/c-strcasestr.c +++ b/lib/c-strcasestr.c @@ -27,127 +27,9 @@ #include "malloca.h" #include "c-ctype.h" -/* Knuth-Morris-Pratt algorithm. - See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm - Return a boolean indicating success. */ -static bool -knuth_morris_pratt (const char *haystack, const char *needle, - const char **resultp) -{ - size_t m = strlen (needle); - - /* Allocate the table. */ - size_t *table = (size_t *) nmalloca (m, sizeof (size_t)); - if (table == NULL) - return false; - /* Fill the table. - For 0 < i < m: - 0 < table[i] <= i is defined such that - forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x], - and table[i] is as large as possible with this property. - This implies: - 1) For 0 < i < m: - If table[i] < i, - needle[table[i]..i-1] = needle[0..i-1-table[i]]. - 2) For 0 < i < m: - rhaystack[0..i-1] == needle[0..i-1] - and exists h, i <= h < m: rhaystack[h] != needle[h] - implies - forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1]. - table[0] remains uninitialized. */ - { - size_t i, j; - - /* i = 1: Nothing to verify for x = 0. */ - table[1] = 1; - j = 0; - - for (i = 2; i < m; i++) - { - /* Here: j = i-1 - table[i-1]. - The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold - for x < table[i-1], by induction. - Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ - unsigned char b = c_tolower ((unsigned char) needle[i - 1]); - - for (;;) - { - /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x] - is known to hold for x < i-1-j. - Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ - if (b == c_tolower ((unsigned char) needle[j])) - { - /* Set table[i] := i-1-j. */ - table[i] = i - ++j; - break; - } - /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds - for x = i-1-j, because - needle[i-1] != needle[j] = needle[i-1-x]. */ - if (j == 0) - { - /* The inequality holds for all possible x. */ - table[i] = i; - break; - } - /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds - for i-1-j < x < i-1-j+table[j], because for these x: - needle[x..i-2] - = needle[x-(i-1-j)..j-1] - != needle[0..j-1-(x-(i-1-j))] (by definition of table[j]) - = needle[0..i-2-x], - hence needle[x..i-1] != needle[0..i-1-x]. - Furthermore - needle[i-1-j+table[j]..i-2] - = needle[table[j]..j-1] - = needle[0..j-1-table[j]] (by definition of table[j]). */ - j = j - table[j]; - } - /* Here: j = i - table[i]. */ - } - } - - /* Search, using the table to accelerate the processing. */ - { - size_t j; - const char *rhaystack; - const char *phaystack; - - *resultp = NULL; - j = 0; - rhaystack = haystack; - phaystack = haystack; - /* Invariant: phaystack = rhaystack + j. */ - while (*phaystack != '\0') - if (c_tolower ((unsigned char) needle[j]) - == c_tolower ((unsigned char) *phaystack)) - { - j++; - phaystack++; - if (j == m) - { - /* The entire needle has been found. */ - *resultp = rhaystack; - break; - } - } - else if (j > 0) - { - /* Found a match of needle[0..j-1], mismatch at needle[j]. */ - rhaystack += table[j]; - j -= table[j]; - } - else - { - /* Found a mismatch at needle[0] already. */ - rhaystack++; - phaystack++; - } - } - - freea (table); - return true; -} +/* Knuth-Morris-Pratt algorithm. */ +#define CANON_ELEMENT(c) c_tolower (c) +#include "str-kmp.h" /* Find the first occurrence of NEEDLE in HAYSTACK, using case-insensitive comparison. @@ -215,7 +97,7 @@ c_strcasestr (const char *haystack, const char *needle) /* Try the Knuth-Morris-Pratt algorithm. */ const char *result; bool success = - knuth_morris_pratt (haystack, needle - 1, &result); + knuth_morris_pratt_unibyte (haystack, needle - 1, &result); if (success) return (char *) result; try_kmp = false; diff --git a/lib/c-strstr.c b/lib/c-strstr.c index 3652100e6..47226c3d4 100644 --- a/lib/c-strstr.c +++ b/lib/c-strstr.c @@ -26,126 +26,9 @@ #include "malloca.h" -/* Knuth-Morris-Pratt algorithm. - See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm - Return a boolean indicating success. */ -static bool -knuth_morris_pratt (const char *haystack, const char *needle, - const char **resultp) -{ - size_t m = strlen (needle); - - /* Allocate the table. */ - size_t *table = (size_t *) nmalloca (m, sizeof (size_t)); - if (table == NULL) - return false; - /* Fill the table. - For 0 < i < m: - 0 < table[i] <= i is defined such that - forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x], - and table[i] is as large as possible with this property. - This implies: - 1) For 0 < i < m: - If table[i] < i, - needle[table[i]..i-1] = needle[0..i-1-table[i]]. - 2) For 0 < i < m: - rhaystack[0..i-1] == needle[0..i-1] - and exists h, i <= h < m: rhaystack[h] != needle[h] - implies - forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1]. - table[0] remains uninitialized. */ - { - size_t i, j; - - /* i = 1: Nothing to verify for x = 0. */ - table[1] = 1; - j = 0; - - for (i = 2; i < m; i++) - { - /* Here: j = i-1 - table[i-1]. - The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold - for x < table[i-1], by induction. - Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ - unsigned char b = (unsigned char) needle[i - 1]; - - for (;;) - { - /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x] - is known to hold for x < i-1-j. - Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ - if (b == (unsigned char) needle[j]) - { - /* Set table[i] := i-1-j. */ - table[i] = i - ++j; - break; - } - /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds - for x = i-1-j, because - needle[i-1] != needle[j] = needle[i-1-x]. */ - if (j == 0) - { - /* The inequality holds for all possible x. */ - table[i] = i; - break; - } - /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds - for i-1-j < x < i-1-j+table[j], because for these x: - needle[x..i-2] - = needle[x-(i-1-j)..j-1] - != needle[0..j-1-(x-(i-1-j))] (by definition of table[j]) - = needle[0..i-2-x], - hence needle[x..i-1] != needle[0..i-1-x]. - Furthermore - needle[i-1-j+table[j]..i-2] - = needle[table[j]..j-1] - = needle[0..j-1-table[j]] (by definition of table[j]). */ - j = j - table[j]; - } - /* Here: j = i - table[i]. */ - } - } - - /* Search, using the table to accelerate the processing. */ - { - size_t j; - const char *rhaystack; - const char *phaystack; - - *resultp = NULL; - j = 0; - rhaystack = haystack; - phaystack = haystack; - /* Invariant: phaystack = rhaystack + j. */ - while (*phaystack != '\0') - if ((unsigned char) needle[j] == (unsigned char) *phaystack) - { - j++; - phaystack++; - if (j == m) - { - /* The entire needle has been found. */ - *resultp = rhaystack; - break; - } - } - else if (j > 0) - { - /* Found a match of needle[0..j-1], mismatch at needle[j]. */ - rhaystack += table[j]; - j -= table[j]; - } - else - { - /* Found a mismatch at needle[0] already. */ - rhaystack++; - phaystack++; - } - } - - freea (table); - return true; -} +/* Knuth-Morris-Pratt algorithm. */ +#define CANON_ELEMENT(c) c +#include "str-kmp.h" /* Find the first occurrence of NEEDLE in HAYSTACK. */ char * @@ -210,7 +93,7 @@ c_strstr (const char *haystack, const char *needle) /* Try the Knuth-Morris-Pratt algorithm. */ const char *result; bool success = - knuth_morris_pratt (haystack, needle - 1, &result); + knuth_morris_pratt_unibyte (haystack, needle - 1, &result); if (success) return (char *) result; try_kmp = false; diff --git a/lib/mbscasestr.c b/lib/mbscasestr.c index 7205cca1e..82eb968ae 100644 --- a/lib/mbscasestr.c +++ b/lib/mbscasestr.c @@ -31,130 +31,14 @@ #define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch)) +/* Knuth-Morris-Pratt algorithm. */ +#define CANON_ELEMENT(c) TOLOWER (c) +#include "str-kmp.h" + +#if HAVE_MBRTOWC /* Knuth-Morris-Pratt algorithm. See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm Return a boolean indicating success. */ - -static bool -knuth_morris_pratt_unibyte (const char *haystack, const char *needle, - const char **resultp) -{ - size_t m = strlen (needle); - - /* Allocate the table. */ - size_t *table = (size_t *) nmalloca (m, sizeof (size_t)); - if (table == NULL) - return false; - /* Fill the table. - For 0 < i < m: - 0 < table[i] <= i is defined such that - forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x], - and table[i] is as large as possible with this property. - This implies: - 1) For 0 < i < m: - If table[i] < i, - needle[table[i]..i-1] = needle[0..i-1-table[i]]. - 2) For 0 < i < m: - rhaystack[0..i-1] == needle[0..i-1] - and exists h, i <= h < m: rhaystack[h] != needle[h] - implies - forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1]. - table[0] remains uninitialized. */ - { - size_t i, j; - - /* i = 1: Nothing to verify for x = 0. */ - table[1] = 1; - j = 0; - - for (i = 2; i < m; i++) - { - /* Here: j = i-1 - table[i-1]. - The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold - for x < table[i-1], by induction. - Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ - unsigned char b = TOLOWER ((unsigned char) needle[i - 1]); - - for (;;) - { - /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x] - is known to hold for x < i-1-j. - Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ - if (b == TOLOWER ((unsigned char) needle[j])) - { - /* Set table[i] := i-1-j. */ - table[i] = i - ++j; - break; - } - /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds - for x = i-1-j, because - needle[i-1] != needle[j] = needle[i-1-x]. */ - if (j == 0) - { - /* The inequality holds for all possible x. */ - table[i] = i; - break; - } - /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds - for i-1-j < x < i-1-j+table[j], because for these x: - needle[x..i-2] - = needle[x-(i-1-j)..j-1] - != needle[0..j-1-(x-(i-1-j))] (by definition of table[j]) - = needle[0..i-2-x], - hence needle[x..i-1] != needle[0..i-1-x]. - Furthermore - needle[i-1-j+table[j]..i-2] - = needle[table[j]..j-1] - = needle[0..j-1-table[j]] (by definition of table[j]). */ - j = j - table[j]; - } - /* Here: j = i - table[i]. */ - } - } - - /* Search, using the table to accelerate the processing. */ - { - size_t j; - const char *rhaystack; - const char *phaystack; - - *resultp = NULL; - j = 0; - rhaystack = haystack; - phaystack = haystack; - /* Invariant: phaystack = rhaystack + j. */ - while (*phaystack != '\0') - if (TOLOWER ((unsigned char) needle[j]) - == TOLOWER ((unsigned char) *phaystack)) - { - j++; - phaystack++; - if (j == m) - { - /* The entire needle has been found. */ - *resultp = rhaystack; - break; - } - } - else if (j > 0) - { - /* Found a match of needle[0..j-1], mismatch at needle[j]. */ - rhaystack += table[j]; - j -= table[j]; - } - else - { - /* Found a mismatch at needle[0] already. */ - rhaystack++; - phaystack++; - } - } - - freea (table); - return true; -} - -#if HAVE_MBRTOWC static bool knuth_morris_pratt_multibyte (const char *haystack, const char *needle, const char **resultp) diff --git a/lib/mbsstr.c b/lib/mbsstr.c index 420be0843..48837805b 100644 --- a/lib/mbsstr.c +++ b/lib/mbsstr.c @@ -28,129 +28,14 @@ # include "mbuiter.h" #endif +/* Knuth-Morris-Pratt algorithm. */ +#define CANON_ELEMENT(c) c +#include "str-kmp.h" + +#if HAVE_MBRTOWC /* Knuth-Morris-Pratt algorithm. See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm Return a boolean indicating success. */ - -static bool -knuth_morris_pratt_unibyte (const char *haystack, const char *needle, - const char **resultp) -{ - size_t m = strlen (needle); - - /* Allocate the table. */ - size_t *table = (size_t *) nmalloca (m, sizeof (size_t)); - if (table == NULL) - return false; - /* Fill the table. - For 0 < i < m: - 0 < table[i] <= i is defined such that - forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x], - and table[i] is as large as possible with this property. - This implies: - 1) For 0 < i < m: - If table[i] < i, - needle[table[i]..i-1] = needle[0..i-1-table[i]]. - 2) For 0 < i < m: - rhaystack[0..i-1] == needle[0..i-1] - and exists h, i <= h < m: rhaystack[h] != needle[h] - implies - forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1]. - table[0] remains uninitialized. */ - { - size_t i, j; - - /* i = 1: Nothing to verify for x = 0. */ - table[1] = 1; - j = 0; - - for (i = 2; i < m; i++) - { - /* Here: j = i-1 - table[i-1]. - The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold - for x < table[i-1], by induction. - Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ - unsigned char b = (unsigned char) needle[i - 1]; - - for (;;) - { - /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x] - is known to hold for x < i-1-j. - Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ - if (b == (unsigned char) needle[j]) - { - /* Set table[i] := i-1-j. */ - table[i] = i - ++j; - break; - } - /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds - for x = i-1-j, because - needle[i-1] != needle[j] = needle[i-1-x]. */ - if (j == 0) - { - /* The inequality holds for all possible x. */ - table[i] = i; - break; - } - /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds - for i-1-j < x < i-1-j+table[j], because for these x: - needle[x..i-2] - = needle[x-(i-1-j)..j-1] - != needle[0..j-1-(x-(i-1-j))] (by definition of table[j]) - = needle[0..i-2-x], - hence needle[x..i-1] != needle[0..i-1-x]. - Furthermore - needle[i-1-j+table[j]..i-2] - = needle[table[j]..j-1] - = needle[0..j-1-table[j]] (by definition of table[j]). */ - j = j - table[j]; - } - /* Here: j = i - table[i]. */ - } - } - - /* Search, using the table to accelerate the processing. */ - { - size_t j; - const char *rhaystack; - const char *phaystack; - - *resultp = NULL; - j = 0; - rhaystack = haystack; - phaystack = haystack; - /* Invariant: phaystack = rhaystack + j. */ - while (*phaystack != '\0') - if ((unsigned char) needle[j] == (unsigned char) *phaystack) - { - j++; - phaystack++; - if (j == m) - { - /* The entire needle has been found. */ - *resultp = rhaystack; - break; - } - } - else if (j > 0) - { - /* Found a match of needle[0..j-1], mismatch at needle[j]. */ - rhaystack += table[j]; - j -= table[j]; - } - else - { - /* Found a mismatch at needle[0] already. */ - rhaystack++; - phaystack++; - } - } - - freea (table); - return true; -} - -#if HAVE_MBRTOWC static bool knuth_morris_pratt_multibyte (const char *haystack, const char *needle, const char **resultp) diff --git a/lib/str-kmp.h b/lib/str-kmp.h new file mode 100644 index 000000000..c7882d13d --- /dev/null +++ b/lib/str-kmp.h @@ -0,0 +1,148 @@ +/* Substring search in a NUL terminated string of 'char' elements, + using the Knuth-Morris-Pratt algorithm. + Copyright (C) 2005-2007 Free Software Foundation, Inc. + Written by Bruno Haible , 2005. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +/* Before including this file, you need to define: + CANON_ELEMENT(c) A macro that canonicalizes an element right after + it has been fetched from one of the two strings. + The argument is an 'unsigned char'; the result + must be an 'unsigned char' as well. */ + +/* Knuth-Morris-Pratt algorithm. + See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm + Return a boolean indicating success. */ +static bool +knuth_morris_pratt_unibyte (const char *haystack, const char *needle, + const char **resultp) +{ + size_t m = strlen (needle); + + /* Allocate the table. */ + size_t *table = (size_t *) nmalloca (m, sizeof (size_t)); + if (table == NULL) + return false; + /* Fill the table. + For 0 < i < m: + 0 < table[i] <= i is defined such that + forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x], + and table[i] is as large as possible with this property. + This implies: + 1) For 0 < i < m: + If table[i] < i, + needle[table[i]..i-1] = needle[0..i-1-table[i]]. + 2) For 0 < i < m: + rhaystack[0..i-1] == needle[0..i-1] + and exists h, i <= h < m: rhaystack[h] != needle[h] + implies + forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1]. + table[0] remains uninitialized. */ + { + size_t i, j; + + /* i = 1: Nothing to verify for x = 0. */ + table[1] = 1; + j = 0; + + for (i = 2; i < m; i++) + { + /* Here: j = i-1 - table[i-1]. + The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold + for x < table[i-1], by induction. + Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ + unsigned char b = CANON_ELEMENT ((unsigned char) needle[i - 1]); + + for (;;) + { + /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x] + is known to hold for x < i-1-j. + Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ + if (b == CANON_ELEMENT ((unsigned char) needle[j])) + { + /* Set table[i] := i-1-j. */ + table[i] = i - ++j; + break; + } + /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds + for x = i-1-j, because + needle[i-1] != needle[j] = needle[i-1-x]. */ + if (j == 0) + { + /* The inequality holds for all possible x. */ + table[i] = i; + break; + } + /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds + for i-1-j < x < i-1-j+table[j], because for these x: + needle[x..i-2] + = needle[x-(i-1-j)..j-1] + != needle[0..j-1-(x-(i-1-j))] (by definition of table[j]) + = needle[0..i-2-x], + hence needle[x..i-1] != needle[0..i-1-x]. + Furthermore + needle[i-1-j+table[j]..i-2] + = needle[table[j]..j-1] + = needle[0..j-1-table[j]] (by definition of table[j]). */ + j = j - table[j]; + } + /* Here: j = i - table[i]. */ + } + } + + /* Search, using the table to accelerate the processing. */ + { + size_t j; + const char *rhaystack; + const char *phaystack; + + *resultp = NULL; + j = 0; + rhaystack = haystack; + phaystack = haystack; + /* Invariant: phaystack = rhaystack + j. */ + while (*phaystack != '\0') + if (CANON_ELEMENT ((unsigned char) needle[j]) + == CANON_ELEMENT ((unsigned char) *phaystack)) + { + j++; + phaystack++; + if (j == m) + { + /* The entire needle has been found. */ + *resultp = rhaystack; + break; + } + } + else if (j > 0) + { + /* Found a match of needle[0..j-1], mismatch at needle[j]. */ + rhaystack += table[j]; + j -= table[j]; + } + else + { + /* Found a mismatch at needle[0] already. */ + rhaystack++; + phaystack++; + } + } + + freea (table); + return true; +} + +#undef CANON_ELEMENT diff --git a/lib/strcasestr.c b/lib/strcasestr.c index dfbf925b7..34f36a788 100644 --- a/lib/strcasestr.c +++ b/lib/strcasestr.c @@ -29,127 +29,9 @@ #define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch)) -/* Knuth-Morris-Pratt algorithm. - See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm - Return a boolean indicating success. */ -static bool -knuth_morris_pratt (const char *haystack, const char *needle, - const char **resultp) -{ - size_t m = strlen (needle); - - /* Allocate the table. */ - size_t *table = (size_t *) nmalloca (m, sizeof (size_t)); - if (table == NULL) - return false; - /* Fill the table. - For 0 < i < m: - 0 < table[i] <= i is defined such that - forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x], - and table[i] is as large as possible with this property. - This implies: - 1) For 0 < i < m: - If table[i] < i, - needle[table[i]..i-1] = needle[0..i-1-table[i]]. - 2) For 0 < i < m: - rhaystack[0..i-1] == needle[0..i-1] - and exists h, i <= h < m: rhaystack[h] != needle[h] - implies - forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1]. - table[0] remains uninitialized. */ - { - size_t i, j; - - /* i = 1: Nothing to verify for x = 0. */ - table[1] = 1; - j = 0; - - for (i = 2; i < m; i++) - { - /* Here: j = i-1 - table[i-1]. - The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold - for x < table[i-1], by induction. - Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ - unsigned char b = TOLOWER ((unsigned char) needle[i - 1]); - - for (;;) - { - /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x] - is known to hold for x < i-1-j. - Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ - if (b == TOLOWER ((unsigned char) needle[j])) - { - /* Set table[i] := i-1-j. */ - table[i] = i - ++j; - break; - } - /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds - for x = i-1-j, because - needle[i-1] != needle[j] = needle[i-1-x]. */ - if (j == 0) - { - /* The inequality holds for all possible x. */ - table[i] = i; - break; - } - /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds - for i-1-j < x < i-1-j+table[j], because for these x: - needle[x..i-2] - = needle[x-(i-1-j)..j-1] - != needle[0..j-1-(x-(i-1-j))] (by definition of table[j]) - = needle[0..i-2-x], - hence needle[x..i-1] != needle[0..i-1-x]. - Furthermore - needle[i-1-j+table[j]..i-2] - = needle[table[j]..j-1] - = needle[0..j-1-table[j]] (by definition of table[j]). */ - j = j - table[j]; - } - /* Here: j = i - table[i]. */ - } - } - - /* Search, using the table to accelerate the processing. */ - { - size_t j; - const char *rhaystack; - const char *phaystack; - - *resultp = NULL; - j = 0; - rhaystack = haystack; - phaystack = haystack; - /* Invariant: phaystack = rhaystack + j. */ - while (*phaystack != '\0') - if (TOLOWER ((unsigned char) needle[j]) - == TOLOWER ((unsigned char) *phaystack)) - { - j++; - phaystack++; - if (j == m) - { - /* The entire needle has been found. */ - *resultp = rhaystack; - break; - } - } - else if (j > 0) - { - /* Found a match of needle[0..j-1], mismatch at needle[j]. */ - rhaystack += table[j]; - j -= table[j]; - } - else - { - /* Found a mismatch at needle[0] already. */ - rhaystack++; - phaystack++; - } - } - - freea (table); - return true; -} +/* Knuth-Morris-Pratt algorithm. */ +#define CANON_ELEMENT(c) TOLOWER (c) +#include "str-kmp.h" /* Find the first occurrence of NEEDLE in HAYSTACK, using case-insensitive comparison. @@ -212,7 +94,7 @@ strcasestr (const char *haystack, const char *needle) /* Try the Knuth-Morris-Pratt algorithm. */ const char *result; bool success = - knuth_morris_pratt (haystack, needle - 1, &result); + knuth_morris_pratt_unibyte (haystack, needle - 1, &result); if (success) return (char *) result; try_kmp = false; diff --git a/modules/c-strcasestr b/modules/c-strcasestr index 2e5164441..46739927f 100644 --- a/modules/c-strcasestr +++ b/modules/c-strcasestr @@ -4,6 +4,7 @@ Case-insensitive searching in a string in C locale. Files: lib/c-strcasestr.h lib/c-strcasestr.c +lib/str-kmp.h Depends-on: c-ctype diff --git a/modules/c-strstr b/modules/c-strstr index d2a989324..67cfd2e46 100644 --- a/modules/c-strstr +++ b/modules/c-strstr @@ -4,6 +4,7 @@ Search for a substring in a string in C locale. Files: lib/c-strstr.h lib/c-strstr.c +lib/str-kmp.h Depends-on: stdbool diff --git a/modules/mbscasestr b/modules/mbscasestr index a7453525d..994d471f1 100644 --- a/modules/mbscasestr +++ b/modules/mbscasestr @@ -3,6 +3,7 @@ mbscasestr() function: case-insensitive search for a substring in a string. Files: lib/mbscasestr.c +lib/str-kmp.h m4/mbscasestr.m4 m4/mbrtowc.m4 diff --git a/modules/mbsstr b/modules/mbsstr index 2502e600e..653078333 100644 --- a/modules/mbsstr +++ b/modules/mbsstr @@ -3,6 +3,7 @@ mbsstr() function: search for a substring in a string. Files: lib/mbsstr.c +lib/str-kmp.h m4/mbsstr.m4 m4/mbrtowc.m4 diff --git a/modules/strcasestr b/modules/strcasestr index a18015a91..884edfd9c 100644 --- a/modules/strcasestr +++ b/modules/strcasestr @@ -3,6 +3,7 @@ strcasestr() function: case-insensitive search for a substring in a string. Files: lib/strcasestr.c +lib/str-kmp.h m4/strcasestr.m4 Depends-on: -- 2.11.0