lib/fstrcmp.c

   1 /* Functions to make fuzzy comparisons between strings
   2    Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006, 2008-2014 Free
   3    Software Foundation, Inc.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  17
  18
  19    Derived from GNU diff 2.7, analyze.c et al.
  20
  21    The basic idea is to consider two vectors as similar if, when
  22    transforming the first vector into the second vector through a
  23    sequence of edits (inserts and deletes of one element each),
  24    this sequence is short - or equivalently, if the ordered list
  25    of elements that are untouched by these edits is long.  For a
  26    good introduction to the subject, read about the "Levenshtein
  27    distance" in Wikipedia.
  28
  29    The basic algorithm is described in:
  30    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
  31    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
  32    see especially section 4.2, which describes the variation used below.
  33
  34    The basic algorithm was independently discovered as described in:
  35    "Algorithms for Approximate String Matching", E. Ukkonen,
  36    Information and Control Vol. 64, 1985, pp. 100-118.
  37
  38    Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
  39    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
  40    at the price of producing suboptimal output for large inputs with
  41    many differences.  */
  42
  43 #include <config.h>
  44
  45 /* Specification.  */
  46 #include "fstrcmp.h"
  47
  48 #include <string.h>
  49 #include <stdbool.h>
  50 #include <stdio.h>
  51 #include <stdlib.h>
  52 #include <limits.h>
  53
  54 #include "glthread/lock.h"
  55 #include "glthread/tls.h"
  56 #include "minmax.h"
  57 #include "xalloc.h"
  58
  59 #ifndef uintptr_t
  60 # define uintptr_t unsigned long
  61 #endif
  62
  63
  64 #define ELEMENT char
  65 #define EQUAL(x,y) ((x) == (y))
  66 #define OFFSET int
  67 #define EXTRA_CONTEXT_FIELDS \
  68   /* The number of edits beyond which the computation can be aborted. */ \
  69   int edit_count_limit; \
  70   /* The number of edits (= number of elements inserted, plus the number of \
  71      elements deleted), temporarily minus edit_count_limit. */ \
  72   int edit_count;
  73 #define NOTE_DELETE(ctxt, xoff) ctxt->edit_count++
  74 #define NOTE_INSERT(ctxt, yoff) ctxt->edit_count++
  75 #define EARLY_ABORT(ctxt) ctxt->edit_count > 0
  76 /* We don't need USE_HEURISTIC, since it is unlikely in typical uses of
  77    fstrcmp().  */
  78 #include "diffseq.h"
  79
  80
  81 /* Because fstrcmp is typically called multiple times, attempt to minimize
  82    the number of memory allocations performed.  Thus, let a call reuse the
  83    memory already allocated by the previous call, if it is sufficient.
  84    To make it multithread-safe, without need for a lock that protects the
  85    already allocated memory, store the allocated memory per thread.  Free
  86    it only when the thread exits.  */
  87
  88 static gl_tls_key_t buffer_key; /* TLS key for a 'int *' */
  89 static gl_tls_key_t bufmax_key; /* TLS key for a 'size_t' */
  90
  91 static void
  92 keys_init (void)
  93 {
  94   gl_tls_key_init (buffer_key, free);
  95   gl_tls_key_init (bufmax_key, NULL);
  96   /* The per-thread initial values are NULL and 0, respectively.  */
  97 }
  98
  99 /* Ensure that keys_init is called once only.  */
 100 gl_once_define(static, keys_init_once)
 101
 102
 103 /* In the code below, branch probabilities were measured by Ralf Wildenhues,
 104    by running "msgmerge LL.po coreutils.pot" with msgmerge 0.18 for many
 105    values of LL.  The probability indicates that the condition evaluates
 106    to true; whether that leads to a branch or a non-branch in the code,
 107    depends on the compiler's reordering of basic blocks.  */
 108
 109
 110 double
 111 fstrcmp_bounded (const char *string1, const char *string2, double lower_bound)
 112 {
 113   struct context ctxt;
 114   int xvec_length = strlen (string1);
 115   int yvec_length = strlen (string2);
 116   int i;
 117
 118   size_t fdiag_len;
 119   int *buffer;
 120   size_t bufmax;
 121
 122   /* short-circuit obvious comparisons */
 123   if (xvec_length == 0 || yvec_length == 0) /* Prob: 1% */
 124     return (xvec_length == 0 && yvec_length == 0 ? 1.0 : 0.0);
 125
 126   if (lower_bound > 0)
 127     {
 128       /* Compute a quick upper bound.
 129          Each edit is an insertion or deletion of an element, hence modifies
 130          the length of the sequence by at most 1.
 131          Therefore, when starting from a sequence X and ending at a sequence Y,
 132          with N edits,  | yvec_length - xvec_length | <= N.  (Proof by
 133          induction over N.)
 134          So, at the end, we will have
 135            edit_count >= | xvec_length - yvec_length |.
 136          and hence
 137            result
 138              = (xvec_length + yvec_length - edit_count)
 139                / (xvec_length + yvec_length)
 140              <= (xvec_length + yvec_length - | yvec_length - xvec_length |)
 141                 / (xvec_length + yvec_length)
 142              = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length).
 143        */
 144       volatile double upper_bound =
 145         (double) (2 * MIN (xvec_length, yvec_length))
 146         / (xvec_length + yvec_length);
 147
 148       if (upper_bound < lower_bound) /* Prob: 74% */
 149         /* Return an arbitrary value < LOWER_BOUND.  */
 150         return 0.0;
 151
 152 #if CHAR_BIT <= 8
 153       /* When X and Y are both small, avoid the overhead of setting up an
 154          array of size 256.  */
 155       if (xvec_length + yvec_length >= 20) /* Prob: 99% */
 156         {
 157           /* Compute a less quick upper bound.
 158              Each edit is an insertion or deletion of a character, hence
 159              modifies the occurrence count of a character by 1 and leaves the
 160              other occurrence counts unchanged.
 161              Therefore, when starting from a sequence X and ending at a
 162              sequence Y, and denoting the occurrence count of C in X with
 163              OCC (X, C), with N edits,
 164                sum_C | OCC (X, C) - OCC (Y, C) | <= N.
 165              (Proof by induction over N.)
 166              So, at the end, we will have
 167                edit_count >= sum_C | OCC (X, C) - OCC (Y, C) |,
 168              and hence
 169                result
 170                  = (xvec_length + yvec_length - edit_count)
 171                    / (xvec_length + yvec_length)
 172                  <= (xvec_length + yvec_length - sum_C | OCC(X,C) - OCC(Y,C) |)
 173                     / (xvec_length + yvec_length).
 174            */
 175           int occ_diff[UCHAR_MAX + 1]; /* array C -> OCC(X,C) - OCC(Y,C) */
 176           int sum;
 177
 178           /* Determine the occurrence counts in X.  */
 179           memset (occ_diff, 0, sizeof (occ_diff));
 180           for (i = xvec_length - 1; i >= 0; i--)
 181             occ_diff[(unsigned char) string1[i]]++;
 182           /* Subtract the occurrence counts in Y.  */
 183           for (i = yvec_length - 1; i >= 0; i--)
 184             occ_diff[(unsigned char) string2[i]]--;
 185           /* Sum up the absolute values.  */
 186           sum = 0;
 187           for (i = 0; i <= UCHAR_MAX; i++)
 188             {
 189               int d = occ_diff[i];
 190               sum += (d >= 0 ? d : -d);
 191             }
 192
 193           upper_bound = 1.0 - (double) sum / (xvec_length + yvec_length);
 194
 195           if (upper_bound < lower_bound) /* Prob: 66% */
 196             /* Return an arbitrary value < LOWER_BOUND.  */
 197             return 0.0;
 198         }
 199 #endif
 200     }
 201
 202   /* set the info for each string.  */
 203   ctxt.xvec = string1;
 204   ctxt.yvec = string2;
 205
 206   /* Set TOO_EXPENSIVE to be approximate square root of input size,
 207      bounded below by 256.  */
 208   ctxt.too_expensive = 1;
 209   for (i = xvec_length + yvec_length;
 210        i != 0;
 211        i >>= 2)
 212     ctxt.too_expensive <<= 1;
 213   if (ctxt.too_expensive < 256)
 214     ctxt.too_expensive = 256;
 215
 216   /* Allocate memory for fdiag and bdiag from a thread-local pool.  */
 217   fdiag_len = xvec_length + yvec_length + 3;
 218   gl_once (keys_init_once, keys_init);
 219   buffer = (int *) gl_tls_get (buffer_key);
 220   bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key);
 221   if (fdiag_len > bufmax)
 222     {
 223       /* Need more memory.  */
 224       bufmax = 2 * bufmax;
 225       if (fdiag_len > bufmax)
 226         bufmax = fdiag_len;
 227       /* Calling xrealloc would be a waste: buffer's contents does not need
 228          to be preserved.  */
 229       if (buffer != NULL)
 230         free (buffer);
 231       buffer = (int *) xnmalloc (bufmax, 2 * sizeof (int));
 232       gl_tls_set (buffer_key, buffer);
 233       gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax);
 234     }
 235   ctxt.fdiag = buffer + yvec_length + 1;
 236   ctxt.bdiag = ctxt.fdiag + fdiag_len;
 237
 238   /* The edit_count is only ever increased.  The computation can be aborted
 239      when
 240        (xvec_length + yvec_length - edit_count) / (xvec_length + yvec_length)
 241        < lower_bound,
 242      or equivalently
 243        edit_count > (xvec_length + yvec_length) * (1 - lower_bound)
 244      or equivalently
 245        edit_count > floor((xvec_length + yvec_length) * (1 - lower_bound)).
 246      We need to add an epsilon inside the floor(...) argument, to neutralize
 247      rounding errors.  */
 248   ctxt.edit_count_limit =
 249     (lower_bound < 1.0
 250      ? (int) ((xvec_length + yvec_length) * (1.0 - lower_bound + 0.000001))
 251      : 0);
 252
 253   /* Now do the main comparison algorithm */
 254   ctxt.edit_count = - ctxt.edit_count_limit;
 255   if (compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt)) /* Prob: 98% */
 256     /* The edit_count passed the limit.  Hence the result would be
 257        < lower_bound.  We can return any value < lower_bound instead.  */
 258     return 0.0;
 259   ctxt.edit_count += ctxt.edit_count_limit;
 260
 261   /* The result is
 262         ((number of chars in common) / (average length of the strings)).
 263      The numerator is
 264         = xvec_length - (number of calls to NOTE_DELETE)
 265         = yvec_length - (number of calls to NOTE_INSERT)
 266         = 1/2 * (xvec_length + yvec_length - (number of edits)).
 267      This is admittedly biased towards finding that the strings are
 268      similar, however it does produce meaningful results.  */
 269   return ((double) (xvec_length + yvec_length - ctxt.edit_count)
 270           / (xvec_length + yvec_length));
 271 }