1 /* Case-insensitive searching in a string.
2 Copyright (C) 2005-2007 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2005.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
25 #include <stddef.h> /* for NULL, in case a nonstandard string.h lacks it */
32 #define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch))
34 /* Knuth-Morris-Pratt algorithm.
35 See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
36 Return a boolean indicating success. */
39 knuth_morris_pratt_unibyte (const char *haystack, const char *needle,
42 size_t m = strlen (needle);
44 /* Allocate the table. */
45 size_t *table = (size_t *) malloca (m * sizeof (size_t));
50 0 < table[i] <= i is defined such that
51 rhaystack[0..i-1] == needle[0..i-1] and rhaystack[i] != needle[i]
53 forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1],
54 and table[i] is as large as possible with this property.
55 table[0] remains uninitialized. */
61 for (i = 2; i < m; i++)
63 unsigned char b = TOLOWER ((unsigned char) needle[i - 1]);
67 if (b == TOLOWER ((unsigned char) needle[j]))
82 /* Search, using the table to accelerate the processing. */
85 const char *rhaystack;
86 const char *phaystack;
92 /* Invariant: phaystack = rhaystack + j. */
93 while (*phaystack != '\0')
94 if (TOLOWER ((unsigned char) needle[j])
95 == TOLOWER ((unsigned char) *phaystack))
101 /* The entire needle has been found. */
102 *resultp = rhaystack;
108 /* Found a match of needle[0..j-1], mismatch at needle[j]. */
109 rhaystack += table[j];
114 /* Found a mismatch at needle[0] already. */
126 knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
127 const char **resultp)
129 size_t m = mbslen (needle);
130 mbchar_t *needle_mbchars;
133 /* Allocate room for needle_mbchars and the table. */
134 char *memory = (char *) malloca (m * (sizeof (mbchar_t) + sizeof (size_t)));
137 needle_mbchars = (mbchar_t *) memory;
138 table = (size_t *) (memory + m * sizeof (mbchar_t));
140 /* Fill needle_mbchars. */
142 mbui_iterator_t iter;
146 for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++)
148 mb_copy (&needle_mbchars[j], &mbui_cur (iter));
149 if (needle_mbchars[j].wc_valid)
150 needle_mbchars[j].wc = towlower (needle_mbchars[j].wc);
156 0 < table[i] <= i is defined such that
157 rhaystack[0..i-1] == needle[0..i-1] and rhaystack[i] != needle[i]
159 forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1],
160 and table[i] is as large as possible with this property.
161 table[0] remains uninitialized. */
167 for (i = 2; i < m; i++)
169 mbchar_t *b = &needle_mbchars[i - 1];
173 if (mb_equal (*b, needle_mbchars[j]))
188 /* Search, using the table to accelerate the processing. */
191 mbui_iterator_t rhaystack;
192 mbui_iterator_t phaystack;
196 mbui_init (rhaystack, haystack);
197 mbui_init (phaystack, haystack);
198 /* Invariant: phaystack = rhaystack + j. */
199 while (mbui_avail (phaystack))
203 mb_copy (&c, &mbui_cur (phaystack));
205 c.wc = towlower (c.wc);
206 if (mb_equal (needle_mbchars[j], c))
209 mbui_advance (phaystack);
212 /* The entire needle has been found. */
213 *resultp = mbui_cur_ptr (rhaystack);
219 /* Found a match of needle[0..j-1], mismatch at needle[j]. */
220 size_t count = table[j];
222 for (; count > 0; count--)
224 if (!mbui_avail (rhaystack))
226 mbui_advance (rhaystack);
231 /* Found a mismatch at needle[0] already. */
232 if (!mbui_avail (rhaystack))
234 mbui_advance (rhaystack);
235 mbui_advance (phaystack);
245 /* Find the first occurrence of the character string NEEDLE in the character
246 string HAYSTACK, using case-insensitive comparison.
247 Note: This function may, in multibyte locales, return success even if
248 strlen (haystack) < strlen (needle) ! */
250 mbscasestr (const char *haystack, const char *needle)
252 /* Be careful not to look at the entire extent of haystack or needle
253 until needed. This is useful because of these two cases:
254 - haystack may be very long, and a match of needle found early,
255 - needle may be very long, and not even a short initial segment of
256 needle may be found in haystack. */
260 mbui_iterator_t iter_needle;
262 mbui_init (iter_needle, needle);
263 if (mbui_avail (iter_needle))
265 /* Minimizing the worst-case complexity:
266 Let n = mbslen(haystack), m = mbslen(needle).
267 The naïve algorithm is O(n*m) worst-case.
268 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
270 To achieve linear complexity and yet amortize the cost of the
271 memory allocation, we activate the Knuth-Morris-Pratt algorithm
272 only once the naïve algorithm has already run for some time; more
274 - the outer loop count is >= 10,
275 - the average number of comparisons per outer loop is >= 5,
276 - the total number of comparisons is >= m.
277 But we try it only once. If the memory allocation attempt failed,
278 we don't retry it. */
280 size_t outer_loop_count = 0;
281 size_t comparison_count = 0;
282 size_t last_ccount = 0; /* last comparison count */
283 mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */
286 mbui_iterator_t iter_haystack;
288 mbui_init (iter_needle_last_ccount, needle);
290 mb_copy (&b, &mbui_cur (iter_needle));
292 b.wc = towlower (b.wc);
294 mbui_init (iter_haystack, haystack);
295 for (;; mbui_advance (iter_haystack))
299 if (!mbui_avail (iter_haystack))
303 /* See whether it's advisable to use an asymptotically faster
306 && outer_loop_count >= 10
307 && comparison_count >= 5 * outer_loop_count)
309 /* See if needle + comparison_count now reaches the end of
311 size_t count = comparison_count - last_ccount;
313 count > 0 && mbui_avail (iter_needle_last_ccount);
315 mbui_advance (iter_needle_last_ccount);
316 last_ccount = comparison_count;
317 if (!mbui_avail (iter_needle_last_ccount))
319 /* Try the Knuth-Morris-Pratt algorithm. */
322 knuth_morris_pratt_multibyte (haystack, needle,
325 return (char *) result;
332 mb_copy (&c, &mbui_cur (iter_haystack));
334 c.wc = towlower (c.wc);
336 /* The first character matches. */
338 mbui_iterator_t rhaystack;
339 mbui_iterator_t rneedle;
341 memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t));
342 mbui_advance (rhaystack);
344 mbui_init (rneedle, needle);
345 if (!mbui_avail (rneedle))
347 mbui_advance (rneedle);
349 for (;; mbui_advance (rhaystack), mbui_advance (rneedle))
351 if (!mbui_avail (rneedle))
353 return (char *) mbui_cur_ptr (iter_haystack);
354 if (!mbui_avail (rhaystack))
358 if (!mb_caseequal (mbui_cur (rhaystack),
360 /* Nothing in this round. */
367 return (char *) haystack;
374 /* Minimizing the worst-case complexity:
375 Let n = strlen(haystack), m = strlen(needle).
376 The naïve algorithm is O(n*m) worst-case.
377 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
379 To achieve linear complexity and yet amortize the cost of the
380 memory allocation, we activate the Knuth-Morris-Pratt algorithm
381 only once the naïve algorithm has already run for some time; more
383 - the outer loop count is >= 10,
384 - the average number of comparisons per outer loop is >= 5,
385 - the total number of comparisons is >= m.
386 But we try it only once. If the memory allocation attempt failed,
387 we don't retry it. */
389 size_t outer_loop_count = 0;
390 size_t comparison_count = 0;
391 size_t last_ccount = 0; /* last comparison count */
392 const char *needle_last_ccount = needle; /* = needle + last_ccount */
394 /* Speed up the following searches of needle by caching its first
396 unsigned char b = TOLOWER ((unsigned char) *needle);
401 if (*haystack == '\0')
405 /* See whether it's advisable to use an asymptotically faster
408 && outer_loop_count >= 10
409 && comparison_count >= 5 * outer_loop_count)
411 /* See if needle + comparison_count now reaches the end of
413 if (needle_last_ccount != NULL)
415 needle_last_ccount +=
416 strnlen (needle_last_ccount,
417 comparison_count - last_ccount);
418 if (*needle_last_ccount == '\0')
419 needle_last_ccount = NULL;
420 last_ccount = comparison_count;
422 if (needle_last_ccount == NULL)
424 /* Try the Knuth-Morris-Pratt algorithm. */
427 knuth_morris_pratt_unibyte (haystack, needle - 1,
430 return (char *) result;
437 if (TOLOWER ((unsigned char) *haystack) == b)
438 /* The first character matches. */
440 const char *rhaystack = haystack + 1;
441 const char *rneedle = needle;
443 for (;; rhaystack++, rneedle++)
445 if (*rneedle == '\0')
447 return (char *) haystack;
448 if (*rhaystack == '\0')
452 if (TOLOWER ((unsigned char) *rhaystack)
453 != TOLOWER ((unsigned char) *rneedle))
454 /* Nothing in this round. */
461 return (char *) haystack;