X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Ffstrcmp.c;h=4fdbcae739a1a87d5d8bc37af1024eace5a5a2c2;hb=b48afd89ec9be56156d6e2bebfdf457c778fe5c3;hp=796c5e88e10c005170ba047489839ad4074ff433;hpb=2d49258598687b9c408d662c498875a3ec38e4c8;p=gnulib.git diff --git a/lib/fstrcmp.c b/lib/fstrcmp.c index 796c5e88e..4fdbcae73 100644 --- a/lib/fstrcmp.c +++ b/lib/fstrcmp.c @@ -1,6 +1,6 @@ /* Functions to make fuzzy comparisons between strings - Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006, 2008 - Free Software Foundation, Inc. + Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006, 2008-2010 Free + Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -85,8 +85,8 @@ already allocated memory, store the allocated memory per thread. Free it only when the thread exits. */ -static gl_tls_key_t buffer_key; /* TLS key for a 'int *' */ -static gl_tls_key_t bufmax_key; /* TLS key for a 'size_t' */ +static gl_tls_key_t buffer_key; /* TLS key for a 'int *' */ +static gl_tls_key_t bufmax_key; /* TLS key for a 'size_t' */ static void keys_init (void) @@ -100,6 +100,13 @@ keys_init (void) gl_once_define(static, keys_init_once) +/* In the code below, branch probabilities were measured by Ralf Wildenhues, + by running "msgmerge LL.po coreutils.pot" with msgmerge 0.18 for many + values of LL. The probability indicates that the condition evaluates + to true; whether that leads to a branch or a non-branch in the code, + depends on the compiler's reordering of basic blocks. */ + + double fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) { @@ -113,82 +120,82 @@ fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) size_t bufmax; /* short-circuit obvious comparisons */ - if (xvec_length == 0 || yvec_length == 0) + if (xvec_length == 0 || yvec_length == 0) /* Prob: 1% */ return (xvec_length == 0 && yvec_length == 0 ? 1.0 : 0.0); if (lower_bound > 0) { /* Compute a quick upper bound. - Each edit is an insertion or deletion of an element, hence modifies - the length of the sequence by at most 1. - Therefore, when starting from a sequence X and ending at a sequence Y, - with N edits, | yvec_length - xvec_length | <= N. (Proof by - induction over N.) - So, at the end, we will have - edit_count >= | xvec_length - yvec_length |. - and hence - result - = (xvec_length + yvec_length - edit_count) - / (xvec_length + yvec_length) - <= (xvec_length + yvec_length - | yvec_length - xvec_length |) - / (xvec_length + yvec_length) - = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length). + Each edit is an insertion or deletion of an element, hence modifies + the length of the sequence by at most 1. + Therefore, when starting from a sequence X and ending at a sequence Y, + with N edits, | yvec_length - xvec_length | <= N. (Proof by + induction over N.) + So, at the end, we will have + edit_count >= | xvec_length - yvec_length |. + and hence + result + = (xvec_length + yvec_length - edit_count) + / (xvec_length + yvec_length) + <= (xvec_length + yvec_length - | yvec_length - xvec_length |) + / (xvec_length + yvec_length) + = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length). */ volatile double upper_bound = - (double) (2 * MIN (xvec_length, yvec_length)) - / (xvec_length + yvec_length); + (double) (2 * MIN (xvec_length, yvec_length)) + / (xvec_length + yvec_length); - if (upper_bound < lower_bound) - /* Return an arbitrary value < LOWER_BOUND. */ - return 0.0; + if (upper_bound < lower_bound) /* Prob: 74% */ + /* Return an arbitrary value < LOWER_BOUND. */ + return 0.0; #if CHAR_BIT <= 8 /* When X and Y are both small, avoid the overhead of setting up an - array of size 256. */ - if (xvec_length + yvec_length >= 20) - { - /* Compute a less quick upper bound. - Each edit is an insertion or deletion of a character, hence - modifies the occurrence count of a character by 1 and leaves the - other occurrence counts unchanged. - Therefore, when starting from a sequence X and ending at a - sequence Y, and denoting the occurrence count of C in X with - OCC (X, C), with N edits, - sum_C | OCC (X, C) - OCC (Y, C) | <= N. - (Proof by induction over N.) - So, at the end, we will have - edit_count >= sum_C | OCC (X, C) - OCC (Y, C) |, - and hence - result - = (xvec_length + yvec_length - edit_count) - / (xvec_length + yvec_length) - <= (xvec_length + yvec_length - sum_C | OCC(X,C) - OCC(Y,C) |) - / (xvec_length + yvec_length). - */ - int occ_diff[UCHAR_MAX + 1]; /* array C -> OCC(X,C) - OCC(Y,C) */ - int sum; - - /* Determine the occurrence counts in X. */ - memset (occ_diff, 0, sizeof (occ_diff)); - for (i = xvec_length - 1; i >= 0; i--) - occ_diff[(unsigned char) string1[i]]++; - /* Subtract the occurrence counts in Y. */ - for (i = yvec_length - 1; i >= 0; i--) - occ_diff[(unsigned char) string2[i]]--; - /* Sum up the absolute values. */ - sum = 0; - for (i = 0; i <= UCHAR_MAX; i++) - { - int d = occ_diff[i]; - sum += (d >= 0 ? d : -d); - } - - upper_bound = 1.0 - (double) sum / (xvec_length + yvec_length); - - if (upper_bound < lower_bound) - /* Return an arbitrary value < LOWER_BOUND. */ - return 0.0; - } + array of size 256. */ + if (xvec_length + yvec_length >= 20) /* Prob: 99% */ + { + /* Compute a less quick upper bound. + Each edit is an insertion or deletion of a character, hence + modifies the occurrence count of a character by 1 and leaves the + other occurrence counts unchanged. + Therefore, when starting from a sequence X and ending at a + sequence Y, and denoting the occurrence count of C in X with + OCC (X, C), with N edits, + sum_C | OCC (X, C) - OCC (Y, C) | <= N. + (Proof by induction over N.) + So, at the end, we will have + edit_count >= sum_C | OCC (X, C) - OCC (Y, C) |, + and hence + result + = (xvec_length + yvec_length - edit_count) + / (xvec_length + yvec_length) + <= (xvec_length + yvec_length - sum_C | OCC(X,C) - OCC(Y,C) |) + / (xvec_length + yvec_length). + */ + int occ_diff[UCHAR_MAX + 1]; /* array C -> OCC(X,C) - OCC(Y,C) */ + int sum; + + /* Determine the occurrence counts in X. */ + memset (occ_diff, 0, sizeof (occ_diff)); + for (i = xvec_length - 1; i >= 0; i--) + occ_diff[(unsigned char) string1[i]]++; + /* Subtract the occurrence counts in Y. */ + for (i = yvec_length - 1; i >= 0; i--) + occ_diff[(unsigned char) string2[i]]--; + /* Sum up the absolute values. */ + sum = 0; + for (i = 0; i <= UCHAR_MAX; i++) + { + int d = occ_diff[i]; + sum += (d >= 0 ? d : -d); + } + + upper_bound = 1.0 - (double) sum / (xvec_length + yvec_length); + + if (upper_bound < lower_bound) /* Prob: 66% */ + /* Return an arbitrary value < LOWER_BOUND. */ + return 0.0; + } #endif } @@ -216,11 +223,11 @@ fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) /* Need more memory. */ bufmax = 2 * bufmax; if (fdiag_len > bufmax) - bufmax = fdiag_len; + bufmax = fdiag_len; /* Calling xrealloc would be a waste: buffer's contents does not need - to be preserved. */ + to be preserved. */ if (buffer != NULL) - free (buffer); + free (buffer); buffer = (int *) xnmalloc (bufmax, 2 * sizeof (int)); gl_tls_set (buffer_key, buffer); gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax); @@ -245,20 +252,20 @@ fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) /* Now do the main comparison algorithm */ ctxt.edit_count = - ctxt.edit_count_limit; - if (compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt)) + if (compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt)) /* Prob: 98% */ /* The edit_count passed the limit. Hence the result would be < lower_bound. We can return any value < lower_bound instead. */ return 0.0; ctxt.edit_count += ctxt.edit_count_limit; /* The result is - ((number of chars in common) / (average length of the strings)). + ((number of chars in common) / (average length of the strings)). The numerator is - = xvec_length - (number of calls to NOTE_DELETE) - = yvec_length - (number of calls to NOTE_INSERT) - = 1/2 * (xvec_length + yvec_length - (number of edits)). + = xvec_length - (number of calls to NOTE_DELETE) + = yvec_length - (number of calls to NOTE_INSERT) + = 1/2 * (xvec_length + yvec_length - (number of edits)). This is admittedly biased towards finding that the strings are similar, however it does produce meaningful results. */ return ((double) (xvec_length + yvec_length - ctxt.edit_count) - / (xvec_length + yvec_length)); + / (xvec_length + yvec_length)); }