X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Ffstrcmp.c;h=7bb9eef4f921671518da569389ac4f6edaa60c0b;hb=7ef6c64e210ac0979d7e8ac69bc5b5208c2405ab;hp=86a351fa20cbfd6bce6e6b1d7b8ba33ced61d9b3;hpb=4facad5a3d557a73ea0bb1cf8b6e835163e1def1;p=gnulib.git diff --git a/lib/fstrcmp.c b/lib/fstrcmp.c index 86a351fa2..7bb9eef4f 100644 --- a/lib/fstrcmp.c +++ b/lib/fstrcmp.c @@ -1,6 +1,6 @@ /* Functions to make fuzzy comparisons between strings - Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006, 2008 - Free Software Foundation, Inc. + Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006, 2008-2014 Free + Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -65,11 +65,14 @@ #define EQUAL(x,y) ((x) == (y)) #define OFFSET int #define EXTRA_CONTEXT_FIELDS \ - /* The number of elements inserted or deleted. */ \ - int xvec_edit_count; \ - int yvec_edit_count; -#define NOTE_DELETE(ctxt, xoff) ctxt->xvec_edit_count++ -#define NOTE_INSERT(ctxt, yoff) ctxt->yvec_edit_count++ + /* The number of edits beyond which the computation can be aborted. */ \ + int edit_count_limit; \ + /* The number of edits (= number of elements inserted, plus the number of \ + elements deleted), temporarily minus edit_count_limit. */ \ + int edit_count; +#define NOTE_DELETE(ctxt, xoff) ctxt->edit_count++ +#define NOTE_INSERT(ctxt, yoff) ctxt->edit_count++ +#define EARLY_ABORT(ctxt) ctxt->edit_count > 0 /* We don't need USE_HEURISTIC, since it is unlikely in typical uses of fstrcmp(). */ #include "diffseq.h" @@ -82,8 +85,8 @@ already allocated memory, store the allocated memory per thread. Free it only when the thread exits. */ -static gl_tls_key_t buffer_key; /* TLS key for a 'int *' */ -static gl_tls_key_t bufmax_key; /* TLS key for a 'size_t' */ +static gl_tls_key_t buffer_key; /* TLS key for a 'int *' */ +static gl_tls_key_t bufmax_key; /* TLS key for a 'size_t' */ static void keys_init (void) @@ -97,6 +100,13 @@ keys_init (void) gl_once_define(static, keys_init_once) +/* In the code below, branch probabilities were measured by Ralf Wildenhues, + by running "msgmerge LL.po coreutils.pot" with msgmerge 0.18 for many + values of LL. The probability indicates that the condition evaluates + to true; whether that leads to a branch or a non-branch in the code, + depends on the compiler's reordering of basic blocks. */ + + double fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) { @@ -110,34 +120,83 @@ fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) size_t bufmax; /* short-circuit obvious comparisons */ - if (xvec_length == 0 || yvec_length == 0) + if (xvec_length == 0 || yvec_length == 0) /* Prob: 1% */ return (xvec_length == 0 && yvec_length == 0 ? 1.0 : 0.0); if (lower_bound > 0) { /* Compute a quick upper bound. - Each edit is an insertion or deletion of an element, hence modifies - the length of the sequence by at most 1. - Therefore, when starting from a sequence X and ending at a sequence Y, - with N edits, | yvec_length - xvec_length | <= N. (Proof by - induction over N.) - So, at the end, we will have - xvec_edit_count + yvec_edit_count >= | xvec_length - yvec_length |. - and hence - result - = (xvec_length + yvec_length - (xvec_edit_count + yvec_edit_count)) - / (xvec_length + yvec_length) - <= (xvec_length + yvec_length - | yvec_length - xvec_length |) - / (xvec_length + yvec_length) - = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length). + Each edit is an insertion or deletion of an element, hence modifies + the length of the sequence by at most 1. + Therefore, when starting from a sequence X and ending at a sequence Y, + with N edits, | yvec_length - xvec_length | <= N. (Proof by + induction over N.) + So, at the end, we will have + edit_count >= | xvec_length - yvec_length |. + and hence + result + = (xvec_length + yvec_length - edit_count) + / (xvec_length + yvec_length) + <= (xvec_length + yvec_length - | yvec_length - xvec_length |) + / (xvec_length + yvec_length) + = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length). */ volatile double upper_bound = - (double) (2 * MIN (xvec_length, yvec_length)) - / (xvec_length + yvec_length); - - if (upper_bound < lower_bound) - /* Return an arbitrary value < LOWER_BOUND. */ - return 0.0; + (double) (2 * MIN (xvec_length, yvec_length)) + / (xvec_length + yvec_length); + + if (upper_bound < lower_bound) /* Prob: 74% */ + /* Return an arbitrary value < LOWER_BOUND. */ + return 0.0; + +#if CHAR_BIT <= 8 + /* When X and Y are both small, avoid the overhead of setting up an + array of size 256. */ + if (xvec_length + yvec_length >= 20) /* Prob: 99% */ + { + /* Compute a less quick upper bound. + Each edit is an insertion or deletion of a character, hence + modifies the occurrence count of a character by 1 and leaves the + other occurrence counts unchanged. + Therefore, when starting from a sequence X and ending at a + sequence Y, and denoting the occurrence count of C in X with + OCC (X, C), with N edits, + sum_C | OCC (X, C) - OCC (Y, C) | <= N. + (Proof by induction over N.) + So, at the end, we will have + edit_count >= sum_C | OCC (X, C) - OCC (Y, C) |, + and hence + result + = (xvec_length + yvec_length - edit_count) + / (xvec_length + yvec_length) + <= (xvec_length + yvec_length - sum_C | OCC(X,C) - OCC(Y,C) |) + / (xvec_length + yvec_length). + */ + int occ_diff[UCHAR_MAX + 1]; /* array C -> OCC(X,C) - OCC(Y,C) */ + int sum; + + /* Determine the occurrence counts in X. */ + memset (occ_diff, 0, sizeof (occ_diff)); + for (i = xvec_length - 1; i >= 0; i--) + occ_diff[(unsigned char) string1[i]]++; + /* Subtract the occurrence counts in Y. */ + for (i = yvec_length - 1; i >= 0; i--) + occ_diff[(unsigned char) string2[i]]--; + /* Sum up the absolute values. */ + sum = 0; + for (i = 0; i <= UCHAR_MAX; i++) + { + int d = occ_diff[i]; + sum += (d >= 0 ? d : -d); + } + + upper_bound = 1.0 - (double) sum / (xvec_length + yvec_length); + + if (upper_bound < lower_bound) /* Prob: 66% */ + /* Return an arbitrary value < LOWER_BOUND. */ + return 0.0; + } +#endif } /* set the info for each string. */ @@ -164,11 +223,11 @@ fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) /* Need more memory. */ bufmax = 2 * bufmax; if (fdiag_len > bufmax) - bufmax = fdiag_len; + bufmax = fdiag_len; /* Calling xrealloc would be a waste: buffer's contents does not need - to be preserved. */ + to be preserved. */ if (buffer != NULL) - free (buffer); + free (buffer); buffer = (int *) xnmalloc (bufmax, 2 * sizeof (int)); gl_tls_set (buffer_key, buffer); gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax); @@ -176,17 +235,37 @@ fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) ctxt.fdiag = buffer + yvec_length + 1; ctxt.bdiag = ctxt.fdiag + fdiag_len; + /* The edit_count is only ever increased. The computation can be aborted + when + (xvec_length + yvec_length - edit_count) / (xvec_length + yvec_length) + < lower_bound, + or equivalently + edit_count > (xvec_length + yvec_length) * (1 - lower_bound) + or equivalently + edit_count > floor((xvec_length + yvec_length) * (1 - lower_bound)). + We need to add an epsilon inside the floor(...) argument, to neutralize + rounding errors. */ + ctxt.edit_count_limit = + (lower_bound < 1.0 + ? (int) ((xvec_length + yvec_length) * (1.0 - lower_bound + 0.000001)) + : 0); + /* Now do the main comparison algorithm */ - ctxt.xvec_edit_count = 0; - ctxt.yvec_edit_count = 0; - compareseq (0, xvec_length, 0, yvec_length, 0, - &ctxt); + ctxt.edit_count = - ctxt.edit_count_limit; + if (compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt)) /* Prob: 98% */ + /* The edit_count passed the limit. Hence the result would be + < lower_bound. We can return any value < lower_bound instead. */ + return 0.0; + ctxt.edit_count += ctxt.edit_count_limit; /* The result is - ((number of chars in common) / (average length of the strings)). + ((number of chars in common) / (average length of the strings)). + The numerator is + = xvec_length - (number of calls to NOTE_DELETE) + = yvec_length - (number of calls to NOTE_INSERT) + = 1/2 * (xvec_length + yvec_length - (number of edits)). This is admittedly biased towards finding that the strings are similar, however it does produce meaningful results. */ - return ((double) (xvec_length + yvec_length - - ctxt.yvec_edit_count - ctxt.xvec_edit_count) - / (xvec_length + yvec_length)); + return ((double) (xvec_length + yvec_length - ctxt.edit_count) + / (xvec_length + yvec_length)); }