From: Eric Blake Date: Fri, 11 Jan 2008 05:22:51 +0000 (-0700) Subject: Convert strcasestr module to use Two-Way algorithm. X-Git-Tag: v0.1~7830 X-Git-Url: https://erislabs.net/gitweb/?a=commitdiff_plain;h=9c063a2afdc2f2f6a1da2bb2ec54eadbae42a0ab;p=gnulib.git Convert strcasestr module to use Two-Way algorithm. * modules/strcasestr-simple: New module, based on the old strcasestr, but with Two-Way rather than KMP. * modules/strcasestr (Depends-on): Change to strcasestr-simple. * lib/string.in.h (rpl_strcasestr): Declare. * m4/strcasestr.m4 (gl_FUNC_STRCASESTR): Check for linear performance. * lib/strcasestr.c (strcasestr): Simplify, and avoid malloc. * modules/string (Makefile.am): Support strcasestr. * m4/string_h.m4 (gl_HEADER_STRING_H_DEFAULTS): Likewise. * modules/strcasestr-tests (Depends-on): Check for alarm. * tests/test-strcasestr.c: Augment test. * lib/str-two-way.h: Clean up stray macro. * NEWS: Document new module. * MODULES.html.sh (string handling): Likewise. * doc/functions/strcasestr.texi: New file. * doc/gnulib.texi (Function Substitutes): New node. Move memmem here, since it is not a POSIX function. Signed-off-by: Eric Blake --- diff --git a/ChangeLog b/ChangeLog index 0e594bd6f..be6a23fdb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +2008-01-14 Eric Blake + + Convert strcasestr module to use Two-Way algorithm. + * modules/strcasestr-simple: New module, based on the old + strcasestr, but with Two-Way rather than KMP. + * modules/strcasestr (Depends-on): Change to strcasestr-simple. + * lib/string.in.h (rpl_strcasestr): Declare. + * m4/strcasestr.m4 (gl_FUNC_STRCASESTR): Check for linear + performance. + * lib/strcasestr.c (strcasestr): Simplify, and avoid malloc. + * modules/string (Makefile.am): Support strcasestr. + * m4/string_h.m4 (gl_HEADER_STRING_H_DEFAULTS): Likewise. + * modules/strcasestr-tests (Depends-on): Check for alarm. + * tests/test-strcasestr.c: Augment test. + * lib/str-two-way.h: Clean up stray macro. 
+ * NEWS: Document new module. + * MODULES.html.sh (string handling): Likewise. + * doc/functions/strcasestr.texi: New file. + * doc/gnulib.texi (Function Substitutes): New node. Move memmem + here, since it is not a POSIX function. + 2008-01-14 Colin Watson Bruno Haible diff --git a/MODULES.html.sh b/MODULES.html.sh index 1e6f16bf0..53bf494b3 100755 --- a/MODULES.html.sh +++ b/MODULES.html.sh @@ -1664,6 +1664,7 @@ func_all_modules () func_module c-strcaseeq func_module c-strcasestr func_module strcasestr + func_module strcasestr-simple func_module strchrnul func_module strdup func_module streq diff --git a/NEWS b/NEWS index 89ecd9fd8..568d57af8 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,12 @@ User visible incompatible changes Date Modules Changes +2008-01-14 strcasestr This module now replaces worst-case inefficient + implementations; clients that use controlled + needles and thus do not care about worst-case + efficiency should use the new strcasestr-simple + module instead for smaller code size. + 2008-01-09 alloca-opt Now defines HAVE_ALLOCA_H only when the system supplies an <alloca.h>. Gnulib-using code is now expected to include <alloca.h> unconditionally. diff --git a/doc/functions/strcasestr.texi b/doc/functions/strcasestr.texi new file mode 100644 index 000000000..bd6ed4453 --- /dev/null +++ b/doc/functions/strcasestr.texi @@ -0,0 +1,29 @@ +@node strcasestr +@section @code{strcasestr} +@findex strcasestr + +Unspecified by POSIX, but comparable to a mix of @code{strstr} and +@code{strcasecmp}. + +Gnulib module: strcasestr or strcasestr-simple + +Portability problems fixed by either Gnulib module @code{strcasestr-simple} +or @code{strcasestr}: +@itemize +@item +This function is missing on some platforms: +MacOS X 10.3, FreeBSD 5.2.1, OpenBSD 4.0, AIX 4.3.2, HP-UX 11, IRIX +6.5, OSF/1 5.1, Solaris 10, Cygwin 1.5.x, mingw, Interix 3.5, BeOS. 
+@end itemize + +Portability problems fixed by Gnulib module @code{strcasestr}: +@itemize +@item +This function has quadratic instead of linear worst-case complexity on some +platforms: +glibc 2.6.1, FreeBSD 6.2, NetBSD 3.0, AIX 5.1. +@end itemize + +Portability problems not fixed by Gnulib: +@itemize +@end itemize diff --git a/doc/gnulib.texi b/doc/gnulib.texi index fdab409ef..7d68905e3 100644 --- a/doc/gnulib.texi +++ b/doc/gnulib.texi @@ -644,7 +644,7 @@ The notation ``Gnulib module: ---'' means that Gnulib does not provide a module providing a substitute for the function. When the list ``Portability problems not fixed by Gnulib'' is empty, such a module is not needed: No portability problems are known. Otherwise, it indicates -that such a module would be useful but is not available: Noone so far +that such a module would be useful but is not available: No one so far found this function important enough to contribute a substitute for it. If you need this particular function, you may write to @code{}. @@ -1176,7 +1176,6 @@ If you need this particular function, you may write to * memchr:: * memcmp:: * memcpy:: -* memmem:: * memmove:: * memset:: * mkdir:: @@ -2962,6 +2961,31 @@ not worked around by Gnulib. @include glibc-headers/sysexits.texi @include glibc-headers/ttyent.texi +@node Glibc Function Substitutes +@chapter Glibc Function Substitutes + +This chapter describes which functions and function-like macros +provided as extensions by at least glibc are also supported by Gnulib, +which portability pitfalls are fixed by Gnulib, and which (known) +portability problems are not worked around by Gnulib. + +The notation ``Gnulib module: ---'' means that Gnulib does not provide a +module providing a substitute for the function. When the list +``Portability problems not fixed by Gnulib'' is empty, such a module is +not needed: No portability problems are known. 
Otherwise, it indicates +that such a module would be useful but is not available: No one so far +found this function important enough to contribute a substitute for it. +If you need this particular function, you may write to +@code{}. + +@menu +* memmem:: +* strcasestr:: +@end menu + +@include functions/memmem.texi +@include functions/strcasestr.texi + @node Particular Modules @chapter Particular Modules diff --git a/lib/str-two-way.h b/lib/str-two-way.h index 3aa3a1b76..d144ac95b 100644 --- a/lib/str-two-way.h +++ b/lib/str-two-way.h @@ -422,5 +422,6 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, #undef AVAILABLE #undef CANON_ELEMENT +#undef CMP_FUNC #undef MAX #undef RETURN_TYPE diff --git a/lib/strcasestr.c b/lib/strcasestr.c index 34f36a788..fe970ae84 100644 --- a/lib/strcasestr.c +++ b/lib/strcasestr.c @@ -1,5 +1,5 @@ /* Case-insensitive searching in a string. - Copyright (C) 2005-2007 Free Software Foundation, Inc. + Copyright (C) 2005-2008 Free Software Foundation, Inc. Written by Bruno Haible , 2005. This program is free software; you can redistribute it and/or modify @@ -23,109 +23,61 @@ #include #include -#include /* for NULL, in case a nonstandard string.h lacks it */ - -#include "malloca.h" +#include #define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch)) -/* Knuth-Morris-Pratt algorithm. */ +/* Two-Way algorithm. */ +#define RETURN_TYPE char * +#define AVAILABLE(h, h_l, j, n_l) \ + (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \ + && ((h_l) = (j) + (n_l))) #define CANON_ELEMENT(c) TOLOWER (c) -#include "str-kmp.h" +#define CMP_FUNC(p1, p2, l) \ + strncasecmp ((const char *) (p1), (const char *) (p2), l) +#include "str-two-way.h" -/* Find the first occurrence of NEEDLE in HAYSTACK, using case-insensitive - comparison. - Note: This function may, in multibyte locales, return success even if - strlen (haystack) < strlen (needle) ! */ +/* Find the first occurrence of NEEDLE in HAYSTACK, using + case-insensitive comparison. 
This function gives unspecified + results in multibyte locales. */ char * -strcasestr (const char *haystack, const char *needle) +strcasestr (const char *haystack_start, const char *needle_start) { - if (*needle != '\0') - { - /* Minimizing the worst-case complexity: - Let n = strlen(haystack), m = strlen(needle). - The naïve algorithm is O(n*m) worst-case. - The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a - memory allocation. - To achieve linear complexity and yet amortize the cost of the memory - allocation, we activate the Knuth-Morris-Pratt algorithm only once - the naïve algorithm has already run for some time; more precisely, - when - - the outer loop count is >= 10, - - the average number of comparisons per outer loop is >= 5, - - the total number of comparisons is >= m. - But we try it only once. If the memory allocation attempt failed, - we don't retry it. */ - bool try_kmp = true; - size_t outer_loop_count = 0; - size_t comparison_count = 0; - size_t last_ccount = 0; /* last comparison count */ - const char *needle_last_ccount = needle; /* = needle + last_ccount */ - - /* Speed up the following searches of needle by caching its first - character. */ - unsigned char b = TOLOWER ((unsigned char) *needle); + const char *haystack = haystack_start; + const char *needle = needle_start; + size_t needle_len; /* Length of NEEDLE. */ + size_t haystack_len; /* Known minimum length of HAYSTACK. */ + bool ok = true; /* True if NEEDLE is prefix of HAYSTACK. */ + /* Determine length of NEEDLE, and in the process, make sure + HAYSTACK is at least as long (no point processing all of a long + NEEDLE if HAYSTACK is too short). */ + while (*haystack && *needle) + { + ok &= (TOLOWER ((unsigned char) *haystack) + == TOLOWER ((unsigned char) *needle)); + haystack++; needle++; - for (;; haystack++) - { - if (*haystack == '\0') - /* No match. */ - return NULL; - - /* See whether it's advisable to use an asymptotically faster - algorithm. 
*/ - if (try_kmp - && outer_loop_count >= 10 - && comparison_count >= 5 * outer_loop_count) - { - /* See if needle + comparison_count now reaches the end of - needle. */ - if (needle_last_ccount != NULL) - { - needle_last_ccount += - strnlen (needle_last_ccount, comparison_count - last_ccount); - if (*needle_last_ccount == '\0') - needle_last_ccount = NULL; - last_ccount = comparison_count; - } - if (needle_last_ccount == NULL) - { - /* Try the Knuth-Morris-Pratt algorithm. */ - const char *result; - bool success = - knuth_morris_pratt_unibyte (haystack, needle - 1, &result); - if (success) - return (char *) result; - try_kmp = false; - } - } - - outer_loop_count++; - comparison_count++; - if (TOLOWER ((unsigned char) *haystack) == b) - /* The first character matches. */ - { - const char *rhaystack = haystack + 1; - const char *rneedle = needle; - - for (;; rhaystack++, rneedle++) - { - if (*rneedle == '\0') - /* Found a match. */ - return (char *) haystack; - if (*rhaystack == '\0') - /* No match. */ - return NULL; - comparison_count++; - if (TOLOWER ((unsigned char) *rhaystack) - != TOLOWER ((unsigned char) *rneedle)) - /* Nothing in this round. */ - break; - } - } - } } - else - return (char *) haystack; + if (*needle) + return NULL; + if (ok) + return (char *) haystack_start; + needle_len = needle - needle_start; + haystack = haystack_start + 1; + haystack_len = needle_len - 1; + + /* Perform the search. Abstract memory is considered to be an array + of 'unsigned char' values, not an array of 'char' values. See + ISO C 99 section 6.2.6.1. 
*/ + if (needle_len < LONG_NEEDLE_THRESHOLD) + return two_way_short_needle ((const unsigned char *) haystack, + haystack_len, + (const unsigned char *) needle_start, + needle_len); + return two_way_long_needle ((const unsigned char *) haystack, haystack_len, + (const unsigned char *) needle_start, + needle_len); } + +#undef LONG_NEEDLE_THRESHOLD diff --git a/lib/string.in.h b/lib/string.in.h index 7e6bd088e..d88e8a73e 100644 --- a/lib/string.in.h +++ b/lib/string.in.h @@ -313,11 +313,15 @@ char *strstr (const char *haystack, const char *needle) /* Find the first occurrence of NEEDLE in HAYSTACK, using case-insensitive comparison. */ -#if ! @HAVE_STRCASESTR@ +#if @GNULIB_STRCASESTR@ +# if @REPLACE_STRCASESTR@ +# define strcasestr rpl_strcasestr +# endif +# if ! @HAVE_STRCASESTR@ || @REPLACE_STRCASESTR@ extern char *strcasestr (const char *haystack, const char *needle) __attribute__ ((__pure__)); -#endif -#if defined GNULIB_POSIXCHECK +# endif +#elif defined GNULIB_POSIXCHECK /* strcasestr() does not work with multibyte strings: It is a glibc extension, and glibc implements it only for unibyte locales. */ diff --git a/m4/strcasestr.m4 b/m4/strcasestr.m4 index 52f3a58eb..1edf8e288 100644 --- a/m4/strcasestr.m4 +++ b/m4/strcasestr.m4 @@ -1,18 +1,63 @@ -# strcasestr.m4 serial 6 -dnl Copyright (C) 2005, 2007 Free Software Foundation, Inc. +# strcasestr.m4 serial 7 +dnl Copyright (C) 2005, 2007, 2008 Free Software Foundation, Inc. dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, dnl with or without modifications, as long as this notice is preserved. -AC_DEFUN([gl_FUNC_STRCASESTR], +dnl Check that strcasestr is present. +AC_DEFUN([gl_FUNC_STRCASESTR_SIMPLE], [ + dnl Persuade glibc to declare strcasestr(). 
+ AC_REQUIRE([AC_USE_SYSTEM_EXTENSIONS]) + AC_REQUIRE([gl_HEADER_STRING_H_DEFAULTS]) - AC_REPLACE_FUNCS(strcasestr) + AC_REPLACE_FUNCS([strcasestr]) if test $ac_cv_func_strcasestr = no; then HAVE_STRCASESTR=0 gl_PREREQ_STRCASESTR fi -]) +]) # gl_FUNC_STRCASESTR_SIMPLE + +dnl Additionally, check that strcasestr is efficient. +AC_DEFUN([gl_FUNC_STRCASESTR], +[ + AC_REQUIRE([gl_FUNC_STRCASESTR_SIMPLE]) + if test $ac_cv_func_strcasestr = yes; then + AC_CACHE_CHECK([whether strcasestr works in linear time], + [gl_cv_func_strcasestr_linear], + [AC_RUN_IFELSE([AC_LANG_PROGRAM([ +#include <string.h> /* for strcasestr */ +#include <stdlib.h> /* for malloc */ +#include <unistd.h> /* for alarm */ +], [[size_t m = 1000000; + char *haystack = (char *) malloc (2 * m + 2); + char *needle = (char *) malloc (m + 2); + void *result = 0; + /* Failure to compile this test due to missing alarm is okay, + since all such platforms (mingw) also lack strcasestr. */ + alarm (5); + /* Check for quadratic performance. */ + if (haystack && needle) + { + memset (haystack, 'A', 2 * m); + haystack[2 * m] = 'B'; + haystack[2 * m + 1] = 0; + memset (needle, 'A', m); + needle[m] = 'B'; + needle[m + 1] = 0; + result = strcasestr (haystack, needle); + } + return !result;]])], + [gl_cv_func_strcasestr_linear=yes], [gl_cv_func_strcasestr_linear=no], + [dnl pessimistically assume the worst, since even glibc 2.6.1 + dnl has quadratic complexity in its strcasestr + gl_cv_func_strcasestr_linear="guessing no"])]) + if test "$gl_cv_func_strcasestr_linear" != yes; then + REPLACE_STRCASESTR=1 + AC_LIBOBJ([strcasestr]) + fi + fi +]) # gl_FUNC_STRCASESTR # Prerequisites of lib/strcasestr.c. 
AC_DEFUN([gl_PREREQ_STRCASESTR], [ diff --git a/m4/string_h.m4 b/m4/string_h.m4 index 02202b695..766d7e984 100644 --- a/m4/string_h.m4 +++ b/m4/string_h.m4 @@ -82,5 +82,6 @@ AC_DEFUN([gl_HEADER_STRING_H_DEFAULTS], REPLACE_STRERROR=0; AC_SUBST([REPLACE_STRERROR]) REPLACE_STRSIGNAL=0; AC_SUBST([REPLACE_STRSIGNAL]) REPLACE_MEMMEM=0; AC_SUBST([REPLACE_MEMMEM]) + REPLACE_STRCASESTR=0; AC_SUBST([REPLACE_STRCASESTR]) REPLACE_STRSTR=0; AC_SUBST([REPLACE_STRSTR]) ]) diff --git a/modules/strcasestr b/modules/strcasestr index 884edfd9c..ea0ed1cc8 100644 --- a/modules/strcasestr +++ b/modules/strcasestr @@ -1,20 +1,13 @@ Description: -strcasestr() function: case-insensitive search for a substring in a string. +strcasestr() function: efficient case-insensitive search for unibyte substring. Files: -lib/strcasestr.c -lib/str-kmp.h -m4/strcasestr.m4 Depends-on: -string -stdbool -malloca -strnlen +strcasestr-simple configure.ac: gl_FUNC_STRCASESTR -gl_STRING_MODULE_INDICATOR([strcasestr]) Makefile.am: diff --git a/modules/strcasestr-simple b/modules/strcasestr-simple new file mode 100644 index 000000000..979c992c1 --- /dev/null +++ b/modules/strcasestr-simple @@ -0,0 +1,28 @@ +Description: +strcasestr() function: case-insensitive search for unibyte substring. 
+ +Files: +lib/strcasestr.c +lib/str-two-way.h +m4/strcasestr.m4 + +Depends-on: +string +stdbool +strcase + +configure.ac: +gl_FUNC_STRCASESTR_SIMPLE +gl_STRING_MODULE_INDICATOR([strcasestr]) + +Makefile.am: + +Include: +<string.h> + +License: +LGPLv2+ + +Maintainer: +Bruno Haible + diff --git a/modules/strcasestr-tests b/modules/strcasestr-tests index e472d5bf5..e5262cca6 100644 --- a/modules/strcasestr-tests +++ b/modules/strcasestr-tests @@ -4,6 +4,7 @@ tests/test-strcasestr.c Depends-on: configure.ac: +AC_CHECK_DECLS_ONCE([alarm]) Makefile.am: TESTS += test-strcasestr diff --git a/modules/string b/modules/string index f59a11e35..52af9597f 100644 --- a/modules/string +++ b/modules/string @@ -70,6 +70,7 @@ string.h: string.in.h -e 's|@''HAVE_DECL_STRERROR''@|$(HAVE_DECL_STRERROR)|g' \ -e 's|@''HAVE_DECL_STRSIGNAL''@|$(HAVE_DECL_STRSIGNAL)|g' \ -e 's|@''REPLACE_MEMMEM''@|$(REPLACE_MEMMEM)|g' \ + -e 's|@''REPLACE_STRCASESTR''@|$(REPLACE_STRCASESTR)|g' \ -e 's|@''REPLACE_STRSTR''@|$(REPLACE_STRSTR)|g' \ -e 's|@''REPLACE_STRERROR''@|$(REPLACE_STRERROR)|g' \ -e 's|@''REPLACE_STRSIGNAL''@|$(REPLACE_STRSIGNAL)|g' \ diff --git a/tests/test-strcasestr.c b/tests/test-strcasestr.c index ceaab7932..9db62f175 100644 --- a/tests/test-strcasestr.c +++ b/tests/test-strcasestr.c @@ -1,5 +1,5 @@ /* Test of case-insensitive searching in a string. - Copyright (C) 2007 Free Software Foundation, Inc. + Copyright (C) 2007, 2008 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -22,6 +22,7 @@ #include <stdio.h> #include <stdlib.h> +#include <unistd.h> #define ASSERT(expr) \ do \ @@ -37,6 +38,14 @@ int main () { +#if HAVE_DECL_ALARM + /* Declare failure if test takes too long, by using default abort + caused by SIGALRM. All known platforms that lack alarm also lack + strcasestr, and the replacement strcasestr is known to not take too + long. 
*/ + alarm (50); +#endif + { const char input[] = "foo"; const char *result = strcasestr (input, ""); @@ -61,6 +70,12 @@ main () ASSERT (result == NULL); } + { + const char input[] = "ABC ABCDAB ABCDABCDABDE"; + const char *result = strcasestr (input, "ABCDaBCD"); + ASSERT (result == input + 11); + } + /* Check that a very long haystack is handled quickly if the needle is short and occurs near the beginning. */ { @@ -110,7 +125,6 @@ main () } /* Check that the asymptotic worst-case complexity is not quadratic. */ -#if !HAVE_STRCASESTR /* The system's strcasestr() function fails this test. */ { size_t m = 1000000; char *haystack = (char *) malloc (2 * m + 2); @@ -135,7 +149,6 @@ main () if (haystack != NULL) free (haystack); } -#endif return 0; }