From 4facad5a3d557a73ea0bb1cf8b6e835163e1def1 Mon Sep 17 00:00:00 2001 From: Ralf Wildenhues Date: Sun, 14 Sep 2008 21:08:57 +0200 Subject: [PATCH] New function fstrcmp_bounded. --- ChangeLog | 8 +++++++ lib/fstrcmp.c | 62 ++++++++++++++++++++++++++++------------------------ lib/fstrcmp.h | 15 +++++++++++-- tests/test-fstrcmp.c | 29 ++++++++++++++++++++++-- 4 files changed, 82 insertions(+), 32 deletions(-) diff --git a/ChangeLog b/ChangeLog index 576e4b191..39d568307 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,13 @@ 2008-09-14 Ralf Wildenhues + * lib/fstrcmp.h (fstrcmp_bounded): New declaration. + (fstrcmp): Define in terms of fstrcmp_bounded. + * lib/fstrcmp.c (fstrcmp_bounded): Renamed from fstrcmp. Add lower_bound argument. + Return quickly if the result is certainly < lower_bound. + * tests/test-fstrcmp.c (check_fstrcmp): Test also fstrcmp_bounded. + +2008-09-14 Ralf Wildenhues + * lib/diffseq.h (EARLY_ABORT): New macro. (compareseq): Change return type to bool. Return true when EARLY_ABORT evaluates to true. diff --git a/lib/fstrcmp.c b/lib/fstrcmp.c index 70e68185d..86a351fa2 100644 --- a/lib/fstrcmp.c +++ b/lib/fstrcmp.c @@ -97,46 +97,52 @@ keys_init (void) gl_once_define(static, keys_init_once) -/* NAME - fstrcmp - fuzzy string compare - - SYNOPSIS - double fstrcmp(const char *, const char *); - - DESCRIPTION - The fstrcmp function may be used to compare two string for - similarity. It is very useful in reducing "cascade" or - "secondary" errors in compilers or other situations where - symbol tables occur. - - RETURNS - double; 0 if the strings are entirly dissimilar, 1 if the - strings are identical, and a number in between if they are - similar. */ - double -fstrcmp (const char *string1, const char *string2) +fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) { struct context ctxt; - int xvec_length; - int yvec_length; + int xvec_length = strlen (string1); + int yvec_length = strlen (string2); int i; size_t fdiag_len; int *buffer; size_t bufmax; + /* short-circuit obvious comparisons */ + if (xvec_length == 0 || yvec_length == 0) + return (xvec_length == 0 && yvec_length == 0 ? 1.0 : 0.0); + + if (lower_bound > 0) + { + /* Compute a quick upper bound. + Each edit is an insertion or deletion of an element, hence modifies + the length of the sequence by at most 1. + Therefore, when starting from a sequence X and ending at a sequence Y, + with N edits, | yvec_length - xvec_length | <= N. (Proof by + induction over N.) + So, at the end, we will have + xvec_edit_count + yvec_edit_count >= | xvec_length - yvec_length |. + and hence + result + = (xvec_length + yvec_length - (xvec_edit_count + yvec_edit_count)) + / (xvec_length + yvec_length) + <= (xvec_length + yvec_length - | yvec_length - xvec_length |) + / (xvec_length + yvec_length) + = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length). + */ + volatile double upper_bound = + (double) (2 * MIN (xvec_length, yvec_length)) + / (xvec_length + yvec_length); + + if (upper_bound < lower_bound) + /* Return an arbitrary value < LOWER_BOUND. */ + return 0.0; + } + /* set the info for each string. */ ctxt.xvec = string1; - xvec_length = strlen (string1); ctxt.yvec = string2; - yvec_length = strlen (string2); - - /* short-circuit obvious comparisons */ - if (xvec_length == 0 && yvec_length == 0) - return 1.0; - if (xvec_length == 0 || yvec_length == 0) - return 0.0; /* Set TOO_EXPENSIVE to be approximate square root of input size, bounded below by 256. */ diff --git a/lib/fstrcmp.h b/lib/fstrcmp.h index 76fa393d2..807b0ae64 100644 --- a/lib/fstrcmp.h +++ b/lib/fstrcmp.h @@ -1,5 +1,6 @@ /* Fuzzy string comparison. - Copyright (C) 1995, 2000, 2002-2003, 2006 Free Software Foundation, Inc. + Copyright (C) 1995, 2000, 2002-2003, 2006, 2008 Free Software + Foundation, Inc. This file was written by Peter Miller @@ -24,9 +25,19 @@ extern "C" { #endif /* Fuzzy compare of S1 and S2. Return a measure for the similarity of S1 - and S1. The higher the result, the more similar the strings are. */ + and S1. The higher the result, the more similar the strings are. + The result is bounded between 0 (meaning very dissimilar strings) and + 1 (meaning identical strings). */ extern double fstrcmp (const char *s1, const char *s2); +/* Like fstrcmp (S1, S2), except that if the result is < LOWER_BOUND, an + arbitrary other value < LOWER_BOUND can be returned. */ +extern double fstrcmp_bounded (const char *s1, const char *s2, + double lower_bound); + +/* A shortcut for fstrcmp. Avoids a function call. */ +#define fstrcmp(s1,s2) fstrcmp_bounded (s1, s2, 0.0) + #ifdef __cplusplus } #endif diff --git a/tests/test-fstrcmp.c b/tests/test-fstrcmp.c index 0e17ff568..b266b5946 100644 --- a/tests/test-fstrcmp.c +++ b/tests/test-fstrcmp.c @@ -45,8 +45,33 @@ check_fstrcmp (const char *string1, const char *string2, double expected) compliant by default, to avoid that msgmerge results become platform and compiler option dependent. 'volatile' is a portable alternative to gcc's -ffloat-store option. */ - volatile double result = fstrcmp (string1, string2); - return (result == expected); + { + volatile double result = fstrcmp (string1, string2); + if (!(result == expected)) + return false; + } + { + volatile double result = fstrcmp_bounded (string1, string2, expected); + if (!(result == expected)) + return false; + } + { + double bound = expected * 0.5; /* implies bound <= expected */ + volatile double result = fstrcmp_bounded (string1, string2, bound); + if (!(result == expected)) + return false; + } + { + double bound = (1 + expected) * 0.5; + if (expected < bound) + { + volatile double result = fstrcmp_bounded (string1, string2, bound); + if (!(result < bound)) + return false; + } + } + + return true; } int -- 2.11.0