lib/fstrcmp.c

   1 /* Functions to make fuzzy comparisons between strings
   2    Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2 of the License, or (at
   7    your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful, but
  10    WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17
  18
  19    Derived from GNU diff 2.7, analyze.c et al.
  20
  21    The basic idea is to consider two strings as similar if, when
  22    transforming the first string into the second string through a
  23    sequence of edits (inserts and deletes of one character each),
  24    this sequence is short - or equivalently, if the ordered list
  25    of characters that are untouched by these edits is long.  For a
  26    good introduction to the subject, read about the "Levenshtein
  27    distance" in Wikipedia.
  28
  29    The basic algorithm is described in:
  30    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
  31    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
  32    see especially section 4.2, which describes the variation used below.
  33
  34    The basic algorithm was independently discovered as described in:
  35    "Algorithms for Approximate String Matching", E. Ukkonen,
  36    Information and Control Vol. 64, 1985, pp. 100-118.
  37
  38    Unless the 'minimal' flag is set, this code uses the TOO_EXPENSIVE
  39    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
  40    at the price of producing suboptimal output for large inputs with
  41    many differences.
  42
  43    Modified to work on strings rather than files
  44    by Peter Miller <pmiller@agso.gov.au>, October 1995 */
  45
  46 #include <config.h>
  47
  48 /* Specification.  */
  49 #include "fstrcmp.h"
  50
  51 #include <string.h>
  52 #include <stdio.h>
  53 #include <stdlib.h>
  54 #include <limits.h>
  55
  56 #include "lock.h"
  57 #include "tls.h"
  58 #include "xalloc.h"
  59
  60 #ifndef uintptr_t
  61 # define uintptr_t unsigned long
  62 #endif
  63
  64
  65 /*
  66  * Context of comparison operation.
  67  */
  68 struct context
  69 {
  70   /*
  71    * Data on one input string being compared.
  72    */
  73   struct string_data
  74   {
  75     /* The string to be compared. */
  76     const char *data;
  77
  78     /* The length of the string to be compared. */
  79     int data_length;
  80
  81     /* The number of characters inserted or deleted. */
  82     int edit_count;
  83   }
  84   string[2];
  85
  86   #ifdef MINUS_H_FLAG
  87
  88   /* This corresponds to the diff -H flag.  With this heuristic, for
  89      strings with a constant small density of changes, the algorithm is
  90      linear in the strings size.  This is unlikely in typical uses of
  91      fstrcmp, and so is usually compiled out.  Besides, there is no
  92      interface to set it true.  */
  93   int heuristic;
  94
  95   #endif
  96
  97   /* Vector, indexed by diagonal, containing 1 + the X coordinate of the
  98      point furthest along the given diagonal in the forward search of the
  99      edit matrix.  */
 100   int *fdiag;
 101
 102   /* Vector, indexed by diagonal, containing the X coordinate of the point
 103      furthest along the given diagonal in the backward search of the edit
 104      matrix.  */
 105   int *bdiag;
 106
 107   /* Edit scripts longer than this are too expensive to compute.  */
 108   int too_expensive;
 109
 110   /* Snakes bigger than this are considered `big'.  */
 111   #define SNAKE_LIMIT   20
 112 };
 113
 114 struct partition
 115 {
 116   /* Midpoints of this partition.  */
 117   int xmid, ymid;
 118
 119   /* Nonzero if low half will be analyzed minimally.  */
 120   int lo_minimal;
 121
 122   /* Likewise for high half.  */
 123   int hi_minimal;
 124 };
 125
 126
 127 /* NAME
 128         diag - find diagonal path
 129
 130    SYNOPSIS
 131         int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
 132                  struct partition *part, struct context *ctxt);
 133
 134    DESCRIPTION
 135         Find the midpoint of the shortest edit script for a specified
 136         portion of the two strings.
 137
 138         Scan from the beginnings of the strings, and simultaneously from
 139         the ends, doing a breadth-first search through the space of
 140         edit-sequence.  When the two searches meet, we have found the
 141         midpoint of the shortest edit sequence.
 142
 143         If MINIMAL is nonzero, find the minimal edit script regardless
 144         of expense.  Otherwise, if the search is too expensive, use
 145         heuristics to stop the search and report a suboptimal answer.
 146
 147    RETURNS
 148         Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal
 149         number XMID - YMID equals the number of inserted characters
 150         minus the number of deleted characters (counting only characters
 151         before the midpoint).  Return the approximate edit cost; this is
 152         the total number of characters inserted or deleted (counting
 153         only characters before the midpoint), unless a heuristic is used
 154         to terminate the search prematurely.
 155
 156         Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
 157         for the left half of the partition is known; similarly for
 158         PART->RIGHT_MINIMAL.
 159
 160    CAVEAT
 161         This function assumes that the first characters of the specified
 162         portions of the two strings do not match, and likewise that the
 163         last characters do not match.  The caller must trim matching
 164         characters from the beginning and end of the portions it is
 165         going to specify.
 166
 167         If we return the "wrong" partitions, the worst this can do is
 168         cause suboptimal diff output.  It cannot cause incorrect diff
 169         output.  */
 170
 171 static int
 172 diag (int xoff, int xlim, int yoff, int ylim, int minimal,
 173       struct partition *part, struct context *ctxt)
 174 {
 175   int *const fd = ctxt->fdiag;  /* Give the compiler a chance. */
 176   int *const bd = ctxt->bdiag;  /* Additional help for the compiler. */
 177   const char *const xv = ctxt->string[0].data;  /* Still more help for the compiler. */
 178   const char *const yv = ctxt->string[1].data;  /* And more and more . . . */
 179   const int dmin = xoff - ylim; /* Minimum valid diagonal. */
 180   const int dmax = xlim - yoff; /* Maximum valid diagonal. */
 181   const int fmid = xoff - yoff; /* Center diagonal of top-down search. */
 182   const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
 183   int fmin = fmid;
 184   int fmax = fmid;              /* Limits of top-down search. */
 185   int bmin = bmid;
 186   int bmax = bmid;              /* Limits of bottom-up search. */
 187   int c;                        /* Cost. */
 188   int odd = (fmid - bmid) & 1;
 189
 190   /*
 191    * True if southeast corner is on an odd diagonal with respect
 192    * to the northwest.
 193    */
 194   fd[fmid] = xoff;
 195   bd[bmid] = xlim;
 196   for (c = 1;; ++c)
 197     {
 198       int d;                    /* Active diagonal. */
 199       int big_snake;
 200
 201       big_snake = 0;
 202       /* Extend the top-down search by an edit step in each diagonal. */
 203       if (fmin > dmin)
 204         fd[--fmin - 1] = -1;
 205       else
 206         ++fmin;
 207       if (fmax < dmax)
 208         fd[++fmax + 1] = -1;
 209       else
 210         --fmax;
 211       for (d = fmax; d >= fmin; d -= 2)
 212         {
 213           int x;
 214           int y;
 215           int oldx;
 216           int tlo;
 217           int thi;
 218
 219           tlo = fd[d - 1],
 220             thi = fd[d + 1];
 221
 222           if (tlo >= thi)
 223             x = tlo + 1;
 224           else
 225             x = thi;
 226           oldx = x;
 227           y = x - d;
 228           while (x < xlim && y < ylim && xv[x] == yv[y])
 229             {
 230               ++x;
 231               ++y;
 232             }
 233           if (x - oldx > SNAKE_LIMIT)
 234             big_snake = 1;
 235           fd[d] = x;
 236           if (odd && bmin <= d && d <= bmax && bd[d] <= x)
 237             {
 238               part->xmid = x;
 239               part->ymid = y;
 240               part->lo_minimal = part->hi_minimal = 1;
 241               return 2 * c - 1;
 242             }
 243         }
 244       /* Similarly extend the bottom-up search.  */
 245       if (bmin > dmin)
 246         bd[--bmin - 1] = INT_MAX;
 247       else
 248         ++bmin;
 249       if (bmax < dmax)
 250         bd[++bmax + 1] = INT_MAX;
 251       else
 252         --bmax;
 253       for (d = bmax; d >= bmin; d -= 2)
 254         {
 255           int x;
 256           int y;
 257           int oldx;
 258           int tlo;
 259           int thi;
 260
 261           tlo = bd[d - 1],
 262             thi = bd[d + 1];
 263           if (tlo < thi)
 264             x = tlo;
 265           else
 266             x = thi - 1;
 267           oldx = x;
 268           y = x - d;
 269           while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
 270             {
 271               --x;
 272               --y;
 273             }
 274           if (oldx - x > SNAKE_LIMIT)
 275             big_snake = 1;
 276           bd[d] = x;
 277           if (!odd && fmin <= d && d <= fmax && x <= fd[d])
 278             {
 279               part->xmid = x;
 280               part->ymid = y;
 281               part->lo_minimal = part->hi_minimal = 1;
 282               return 2 * c;
 283             }
 284         }
 285
 286       if (minimal)
 287         continue;
 288
 289 #ifdef MINUS_H_FLAG
 290       /* Heuristic: check occasionally for a diagonal that has made lots
 291          of progress compared with the edit distance.  If we have any
 292          such, find the one that has made the most progress and return
 293          it as if it had succeeded.
 294
 295          With this heuristic, for strings with a constant small density
 296          of changes, the algorithm is linear in the strings size.  */
 297       if (c > 200 && big_snake && ctxt->heuristic)
 298         {
 299           int best;
 300
 301           best = 0;
 302           for (d = fmax; d >= fmin; d -= 2)
 303             {
 304               int dd;
 305               int x;
 306               int y;
 307               int v;
 308
 309               dd = d - fmid;
 310               x = fd[d];
 311               y = x - d;
 312               v = (x - xoff) * 2 - dd;
 313
 314               if (v > 12 * (c + (dd < 0 ? -dd : dd)))
 315                 {
 316                   if
 317                     (
 318                       v > best
 319                       &&
 320                       xoff + SNAKE_LIMIT <= x
 321                       &&
 322                       x < xlim
 323                       &&
 324                       yoff + SNAKE_LIMIT <= y
 325                       &&
 326                       y < ylim
 327                     )
 328                     {
 329                       /* We have a good enough best diagonal; now insist
 330                          that it end with a significant snake.  */
 331                       int k;
 332
 333                       for (k = 1; xv[x - k] == yv[y - k]; k++)
 334                         {
 335                           if (k == SNAKE_LIMIT)
 336                             {
 337                               best = v;
 338                               part->xmid = x;
 339                               part->ymid = y;
 340                               break;
 341                             }
 342                         }
 343                     }
 344                 }
 345             }
 346           if (best > 0)
 347             {
 348               part->lo_minimal = 1;
 349               part->hi_minimal = 0;
 350               return 2 * c - 1;
 351             }
 352           best = 0;
 353           for (d = bmax; d >= bmin; d -= 2)
 354             {
 355               int dd;
 356               int x;
 357               int y;
 358               int v;
 359
 360               dd = d - bmid;
 361               x = bd[d];
 362               y = x - d;
 363               v = (xlim - x) * 2 + dd;
 364
 365               if (v > 12 * (c + (dd < 0 ? -dd : dd)))
 366                 {
 367                   if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
 368                       yoff < y && y <= ylim - SNAKE_LIMIT)
 369                     {
 370                       /* We have a good enough best diagonal; now insist
 371                          that it end with a significant snake.  */
 372                       int k;
 373
 374                       for (k = 0; xv[x + k] == yv[y + k]; k++)
 375                         {
 376                           if (k == SNAKE_LIMIT - 1)
 377                             {
 378                               best = v;
 379                               part->xmid = x;
 380                               part->ymid = y;
 381                               break;
 382                             }
 383                         }
 384                     }
 385                 }
 386             }
 387           if (best > 0)
 388             {
 389               part->lo_minimal = 0;
 390               part->hi_minimal = 1;
 391               return 2 * c - 1;
 392             }
 393         }
 394 #endif /* MINUS_H_FLAG */
 395
 396       /* Heuristic: if we've gone well beyond the call of duty, give up
 397          and report halfway between our best results so far.  */
 398       if (c >= ctxt->too_expensive)
 399         {
 400           int fxybest;
 401           int fxbest;
 402           int bxybest;
 403           int bxbest;
 404
 405           /* Pacify `gcc -Wall'. */
 406           fxbest = 0;
 407           bxbest = 0;
 408
 409           /* Find forward diagonal that maximizes X + Y.  */
 410           fxybest = -1;
 411           for (d = fmax; d >= fmin; d -= 2)
 412             {
 413               int x;
 414               int y;
 415
 416               x = fd[d] < xlim ? fd[d] : xlim;
 417               y = x - d;
 418
 419               if (ylim < y)
 420                 {
 421                   x = ylim + d;
 422                   y = ylim;
 423                 }
 424               if (fxybest < x + y)
 425                 {
 426                   fxybest = x + y;
 427                   fxbest = x;
 428                 }
 429             }
 430           /* Find backward diagonal that minimizes X + Y.  */
 431           bxybest = INT_MAX;
 432           for (d = bmax; d >= bmin; d -= 2)
 433             {
 434               int x;
 435               int y;
 436
 437               x = xoff > bd[d] ? xoff : bd[d];
 438               y = x - d;
 439
 440               if (y < yoff)
 441                 {
 442                   x = yoff + d;
 443                   y = yoff;
 444                 }
 445               if (x + y < bxybest)
 446                 {
 447                   bxybest = x + y;
 448                   bxbest = x;
 449                 }
 450             }
 451           /* Use the better of the two diagonals.  */
 452           if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
 453             {
 454               part->xmid = fxbest;
 455               part->ymid = fxybest - fxbest;
 456               part->lo_minimal = 1;
 457               part->hi_minimal = 0;
 458             }
 459           else
 460             {
 461               part->xmid = bxbest;
 462               part->ymid = bxybest - bxbest;
 463               part->lo_minimal = 0;
 464               part->hi_minimal = 1;
 465             }
 466           return 2 * c - 1;
 467         }
 468     }
 469 }
 470
 471
 472 /* NAME
 473         compareseq - find edit sequence
 474
 475    SYNOPSIS
 476         void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal,
 477                         struct context *ctxt);
 478
 479    DESCRIPTION
 480         Compare in detail contiguous subsequences of the two strings
 481         which are known, as a whole, to match each other.
 482
 483         The subsequence of string 0 is [XOFF, XLIM) and likewise for
 484         string 1.
 485
 486         Note that XLIM, YLIM are exclusive bounds.  All character
 487         numbers are origin-0.
 488
 489         If MINIMAL is nonzero, find a minimal difference no matter how
 490         expensive it is.  */
 491
 492 static void
 493 compareseq (int xoff, int xlim, int yoff, int ylim, int minimal,
 494             struct context *ctxt)
 495 {
 496   const char *const xv = ctxt->string[0].data;  /* Help the compiler.  */
 497   const char *const yv = ctxt->string[1].data;
 498
 499   /* Slide down the bottom initial diagonal. */
 500   while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
 501     {
 502       ++xoff;
 503       ++yoff;
 504     }
 505
 506   /* Slide up the top initial diagonal. */
 507   while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
 508     {
 509       --xlim;
 510       --ylim;
 511     }
 512
 513   /* Handle simple cases. */
 514   if (xoff == xlim)
 515     {
 516       while (yoff < ylim)
 517         {
 518           ctxt->string[1].edit_count++;
 519           ++yoff;
 520         }
 521     }
 522   else if (yoff == ylim)
 523     {
 524       while (xoff < xlim)
 525         {
 526           ctxt->string[0].edit_count++;
 527           ++xoff;
 528         }
 529     }
 530   else
 531     {
 532       int c;
 533       struct partition part;
 534
 535       /* Find a point of correspondence in the middle of the strings.  */
 536       c = diag (xoff, xlim, yoff, ylim, minimal, &part, ctxt);
 537       if (c == 1)
 538         {
 539 #if 0
 540           /* This should be impossible, because it implies that one of
 541              the two subsequences is empty, and that case was handled
 542              above without calling `diag'.  Let's verify that this is
 543              true.  */
 544           abort ();
 545 #else
 546           /* The two subsequences differ by a single insert or delete;
 547              record it and we are done.  */
 548           if (part.xmid - part.ymid < xoff - yoff)
 549             ctxt->string[1].edit_count++;
 550           else
 551             ctxt->string[0].edit_count++;
 552 #endif
 553         }
 554       else
 555         {
 556           /* Use the partitions to split this problem into subproblems.  */
 557           compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt);
 558           compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt);
 559         }
 560     }
 561 }
 562
 563
 564 /* Because fstrcmp is typically called multiple times, attempt to minimize
 565    the number of memory allocations performed.  Thus, let a call reuse the
 566    memory already allocated by the previous call, if it is sufficient.
 567    To make it multithread-safe, without need for a lock that protects the
 568    already allocated memory, store the allocated memory per thread.  Free
 569    it only when the thread exits.  */
 570
 571 static gl_tls_key_t buffer_key; /* TLS key for a 'int *' */
 572 static gl_tls_key_t bufmax_key; /* TLS key for a 'size_t' */
 573
 574 static void
 575 keys_init (void)
 576 {
 577   gl_tls_key_init (buffer_key, free);
 578   gl_tls_key_init (bufmax_key, NULL);
 579   /* The per-thread initial values are NULL and 0, respectively.  */
 580 }
 581
 582 /* Ensure that keys_init is called once only.  */
 583 gl_once_define(static, keys_init_once);
 584
 585
 586 /* NAME
 587         fstrcmp - fuzzy string compare
 588
 589    SYNOPSIS
 590         double fstrcmp(const char *, const char *);
 591
 592    DESCRIPTION
 593         The fstrcmp function may be used to compare two string for
 594         similarity.  It is very useful in reducing "cascade" or
 595         "secondary" errors in compilers or other situations where
 596         symbol tables occur.
 597
 598    RETURNS
 599         double; 0 if the strings are entirly dissimilar, 1 if the
 600         strings are identical, and a number in between if they are
 601         similar.  */
 602
 603 double
 604 fstrcmp (const char *string1, const char *string2)
 605 {
 606   struct context ctxt;
 607   int i;
 608
 609   size_t fdiag_len;
 610   int *buffer;
 611   size_t bufmax;
 612
 613   /* set the info for each string.  */
 614   ctxt.string[0].data = string1;
 615   ctxt.string[0].data_length = strlen (string1);
 616   ctxt.string[1].data = string2;
 617   ctxt.string[1].data_length = strlen (string2);
 618
 619   /* short-circuit obvious comparisons */
 620   if (ctxt.string[0].data_length == 0 && ctxt.string[1].data_length == 0)
 621     return 1.0;
 622   if (ctxt.string[0].data_length == 0 || ctxt.string[1].data_length == 0)
 623     return 0.0;
 624
 625   /* Set TOO_EXPENSIVE to be approximate square root of input size,
 626      bounded below by 256.  */
 627   ctxt.too_expensive = 1;
 628   for (i = ctxt.string[0].data_length + ctxt.string[1].data_length;
 629        i != 0;
 630        i >>= 2)
 631     ctxt.too_expensive <<= 1;
 632   if (ctxt.too_expensive < 256)
 633     ctxt.too_expensive = 256;
 634
 635   /* Allocate memory for fdiag and bdiag from a thread-local pool.  */
 636   fdiag_len = ctxt.string[0].data_length + ctxt.string[1].data_length + 3;
 637   gl_once (keys_init_once, keys_init);
 638   buffer = (int *) gl_tls_get (buffer_key);
 639   bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key);
 640   if (fdiag_len > bufmax)
 641     {
 642       /* Need more memory.  */
 643       bufmax = 2 * bufmax;
 644       if (fdiag_len > bufmax)
 645         bufmax = fdiag_len;
 646       /* Calling xrealloc would be a waste: buffer's contents does not need
 647          to be preserved.  */
 648       if (buffer != NULL)
 649         free (buffer);
 650       buffer = (int *) xmalloc (bufmax * (2 * sizeof (int)));
 651       gl_tls_set (buffer_key, buffer);
 652       gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax);
 653     }
 654   ctxt.fdiag = buffer + ctxt.string[1].data_length + 1;
 655   ctxt.bdiag = ctxt.fdiag + fdiag_len;
 656
 657   /* Now do the main comparison algorithm */
 658   ctxt.string[0].edit_count = 0;
 659   ctxt.string[1].edit_count = 0;
 660   compareseq (0, ctxt.string[0].data_length, 0, ctxt.string[1].data_length, 0,
 661               &ctxt);
 662
 663   /* The result is
 664         ((number of chars in common) / (average length of the strings)).
 665      This is admittedly biased towards finding that the strings are
 666      similar, however it does produce meaningful results.  */
 667   return ((double) (ctxt.string[0].data_length + ctxt.string[1].data_length
 668                     - ctxt.string[1].edit_count - ctxt.string[0].edit_count)
 669           / (ctxt.string[0].data_length + ctxt.string[1].data_length));
 670 }