1 /* Functions to make fuzzy comparisons between strings
2 Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or (at
7 your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 Derived from GNU diff 2.7, analyze.c et al.
21 The basic idea is to consider two strings as similar if, when
22 transforming the first string into the second string through a
23 sequence of edits (inserts and deletes of one character each),
24 this sequence is short - or equivalently, if the ordered list
25 of characters that are untouched by these edits is long. For a
26 good introduction to the subject, read about the "Levenshtein
27 distance" in Wikipedia.
29 The basic algorithm is described in:
30 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
31 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
32 see especially section 4.2, which describes the variation used below.
34 The basic algorithm was independently discovered as described in:
35 "Algorithms for Approximate String Matching", E. Ukkonen,
36 Information and Control Vol. 64, 1985, pp. 100-118.
38 Unless the 'minimal' flag is set, this code uses the TOO_EXPENSIVE
39 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
40 at the price of producing suboptimal output for large inputs with
43 Modified to work on strings rather than files
44 by Peter Miller <pmiller@agso.gov.au>, October 1995 */
61 # define uintptr_t unsigned long
66 * Context of comparison operation.
71 * Data on one input string being compared.
75 /* The string to be compared. */
78 /* The length of the string to be compared. */
81 /* The number of characters inserted or deleted. */
88 /* This corresponds to the diff -H flag. With this heuristic, for
89 strings with a constant small density of changes, the algorithm is
90 linear in the strings size. This is unlikely in typical uses of
91 fstrcmp, and so is usually compiled out. Besides, there is no
92 interface to set it true. */
97 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the
98 point furthest along the given diagonal in the forward search of the
102 /* Vector, indexed by diagonal, containing the X coordinate of the point
103 furthest along the given diagonal in the backward search of the edit
107 /* Edit scripts longer than this are too expensive to compute. */
110 /* Snakes bigger than this are considered `big'. */
111 #define SNAKE_LIMIT 20
116 /* Midpoints of this partition. */
119 /* Nonzero if low half will be analyzed minimally. */
122 /* Likewise for high half. */
128 diag - find diagonal path
131 int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
132 struct partition *part, struct context *ctxt);
135 Find the midpoint of the shortest edit script for a specified
136 portion of the two strings.
138 Scan from the beginnings of the strings, and simultaneously from
139 the ends, doing a breadth-first search through the space of
140 edit-sequence. When the two searches meet, we have found the
141 midpoint of the shortest edit sequence.
143 If MINIMAL is nonzero, find the minimal edit script regardless
144 of expense. Otherwise, if the search is too expensive, use
145 heuristics to stop the search and report a suboptimal answer.
148 Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal
149 number XMID - YMID equals the number of inserted characters
150 minus the number of deleted characters (counting only characters
151 before the midpoint). Return the approximate edit cost; this is
152 the total number of characters inserted or deleted (counting
153 only characters before the midpoint), unless a heuristic is used
154 to terminate the search prematurely.
156 Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
157 for the left half of the partition is known; similarly for
161 This function assumes that the first characters of the specified
162 portions of the two strings do not match, and likewise that the
163 last characters do not match. The caller must trim matching
164 characters from the beginning and end of the portions it is
167 If we return the "wrong" partitions, the worst this can do is
168 cause suboptimal diff output. It cannot cause incorrect diff
172 diag (int xoff, int xlim, int yoff, int ylim, int minimal,
173 struct partition *part, struct context *ctxt)
175 int *const fd = ctxt->fdiag; /* Give the compiler a chance. */
176 int *const bd = ctxt->bdiag; /* Additional help for the compiler. */
177 const char *const xv = ctxt->string[0].data; /* Still more help for the compiler. */
178 const char *const yv = ctxt->string[1].data; /* And more and more . . . */
179 const int dmin = xoff - ylim; /* Minimum valid diagonal. */
180 const int dmax = xlim - yoff; /* Maximum valid diagonal. */
181 const int fmid = xoff - yoff; /* Center diagonal of top-down search. */
182 const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
184 int fmax = fmid; /* Limits of top-down search. */
186 int bmax = bmid; /* Limits of bottom-up search. */
188 int odd = (fmid - bmid) & 1;
191 * True if southeast corner is on an odd diagonal with respect
198 int d; /* Active diagonal. */
202 /* Extend the top-down search by an edit step in each diagonal. */
211 for (d = fmax; d >= fmin; d -= 2)
228 while (x < xlim && y < ylim && xv[x] == yv[y])
233 if (x - oldx > SNAKE_LIMIT)
236 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
240 part->lo_minimal = part->hi_minimal = 1;
244 /* Similarly extend the bottom-up search. */
246 bd[--bmin - 1] = INT_MAX;
250 bd[++bmax + 1] = INT_MAX;
253 for (d = bmax; d >= bmin; d -= 2)
269 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
274 if (oldx - x > SNAKE_LIMIT)
277 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
281 part->lo_minimal = part->hi_minimal = 1;
290 /* Heuristic: check occasionally for a diagonal that has made lots
291 of progress compared with the edit distance. If we have any
292 such, find the one that has made the most progress and return
293 it as if it had succeeded.
295 With this heuristic, for strings with a constant small density
296 of changes, the algorithm is linear in the strings size. */
297 if (c > 200 && big_snake && ctxt->heuristic)
302 for (d = fmax; d >= fmin; d -= 2)
312 v = (x - xoff) * 2 - dd;
314 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
320 xoff + SNAKE_LIMIT <= x
324 yoff + SNAKE_LIMIT <= y
329 /* We have a good enough best diagonal; now insist
330 that it end with a significant snake. */
333 for (k = 1; xv[x - k] == yv[y - k]; k++)
335 if (k == SNAKE_LIMIT)
348 part->lo_minimal = 1;
349 part->hi_minimal = 0;
353 for (d = bmax; d >= bmin; d -= 2)
363 v = (xlim - x) * 2 + dd;
365 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
367 if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
368 yoff < y && y <= ylim - SNAKE_LIMIT)
370 /* We have a good enough best diagonal; now insist
371 that it end with a significant snake. */
374 for (k = 0; xv[x + k] == yv[y + k]; k++)
376 if (k == SNAKE_LIMIT - 1)
389 part->lo_minimal = 0;
390 part->hi_minimal = 1;
394 #endif /* MINUS_H_FLAG */
396 /* Heuristic: if we've gone well beyond the call of duty, give up
397 and report halfway between our best results so far. */
398 if (c >= ctxt->too_expensive)
405 /* Pacify `gcc -Wall'. */
409 /* Find forward diagonal that maximizes X + Y. */
411 for (d = fmax; d >= fmin; d -= 2)
416 x = fd[d] < xlim ? fd[d] : xlim;
430 /* Find backward diagonal that minimizes X + Y. */
432 for (d = bmax; d >= bmin; d -= 2)
437 x = xoff > bd[d] ? xoff : bd[d];
451 /* Use the better of the two diagonals. */
452 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
455 part->ymid = fxybest - fxbest;
456 part->lo_minimal = 1;
457 part->hi_minimal = 0;
462 part->ymid = bxybest - bxbest;
463 part->lo_minimal = 0;
464 part->hi_minimal = 1;
473 compareseq - find edit sequence
476 void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal,
477 struct context *ctxt);
480 Compare in detail contiguous subsequences of the two strings
481 which are known, as a whole, to match each other.
483 The subsequence of string 0 is [XOFF, XLIM) and likewise for
486 Note that XLIM, YLIM are exclusive bounds. All character
487 numbers are origin-0.
489 If MINIMAL is nonzero, find a minimal difference no matter how
493 compareseq (int xoff, int xlim, int yoff, int ylim, int minimal,
494 struct context *ctxt)
496 const char *const xv = ctxt->string[0].data; /* Help the compiler. */
497 const char *const yv = ctxt->string[1].data;
499 /* Slide down the bottom initial diagonal. */
500 while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
506 /* Slide up the top initial diagonal. */
507 while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
513 /* Handle simple cases. */
518 ctxt->string[1].edit_count++;
522 else if (yoff == ylim)
526 ctxt->string[0].edit_count++;
533 struct partition part;
535 /* Find a point of correspondence in the middle of the strings. */
536 c = diag (xoff, xlim, yoff, ylim, minimal, &part, ctxt);
540 /* This should be impossible, because it implies that one of
541 the two subsequences is empty, and that case was handled
542 above without calling `diag'. Let's verify that this is
546 /* The two subsequences differ by a single insert or delete;
547 record it and we are done. */
548 if (part.xmid - part.ymid < xoff - yoff)
549 ctxt->string[1].edit_count++;
551 ctxt->string[0].edit_count++;
556 /* Use the partitions to split this problem into subproblems. */
557 compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt);
558 compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt);
564 /* Because fstrcmp is typically called multiple times, attempt to minimize
565 the number of memory allocations performed. Thus, let a call reuse the
566 memory already allocated by the previous call, if it is sufficient.
567 To make it multithread-safe, without need for a lock that protects the
568 already allocated memory, store the allocated memory per thread. Free
569 it only when the thread exits. */
571 static gl_tls_key_t buffer_key; /* TLS key for a 'int *' */
572 static gl_tls_key_t bufmax_key; /* TLS key for a 'size_t' */
577 gl_tls_key_init (buffer_key, free);
578 gl_tls_key_init (bufmax_key, NULL);
579 /* The per-thread initial values are NULL and 0, respectively. */
582 /* Ensure that keys_init is called once only. */
583 gl_once_define(static, keys_init_once);
587 fstrcmp - fuzzy string compare
590 double fstrcmp(const char *, const char *);
593 The fstrcmp function may be used to compare two string for
594 similarity. It is very useful in reducing "cascade" or
595 "secondary" errors in compilers or other situations where
599 double; 0 if the strings are entirly dissimilar, 1 if the
600 strings are identical, and a number in between if they are
604 fstrcmp (const char *string1, const char *string2)
613 /* set the info for each string. */
614 ctxt.string[0].data = string1;
615 ctxt.string[0].data_length = strlen (string1);
616 ctxt.string[1].data = string2;
617 ctxt.string[1].data_length = strlen (string2);
619 /* short-circuit obvious comparisons */
620 if (ctxt.string[0].data_length == 0 && ctxt.string[1].data_length == 0)
622 if (ctxt.string[0].data_length == 0 || ctxt.string[1].data_length == 0)
625 /* Set TOO_EXPENSIVE to be approximate square root of input size,
626 bounded below by 256. */
627 ctxt.too_expensive = 1;
628 for (i = ctxt.string[0].data_length + ctxt.string[1].data_length;
631 ctxt.too_expensive <<= 1;
632 if (ctxt.too_expensive < 256)
633 ctxt.too_expensive = 256;
635 /* Allocate memory for fdiag and bdiag from a thread-local pool. */
636 fdiag_len = ctxt.string[0].data_length + ctxt.string[1].data_length + 3;
637 gl_once (keys_init_once, keys_init);
638 buffer = (int *) gl_tls_get (buffer_key);
639 bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key);
640 if (fdiag_len > bufmax)
642 /* Need more memory. */
644 if (fdiag_len > bufmax)
646 /* Calling xrealloc would be a waste: buffer's contents does not need
650 buffer = (int *) xmalloc (bufmax * (2 * sizeof (int)));
651 gl_tls_set (buffer_key, buffer);
652 gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax);
654 ctxt.fdiag = buffer + ctxt.string[1].data_length + 1;
655 ctxt.bdiag = ctxt.fdiag + fdiag_len;
657 /* Now do the main comparison algorithm */
658 ctxt.string[0].edit_count = 0;
659 ctxt.string[1].edit_count = 0;
660 compareseq (0, ctxt.string[0].data_length, 0, ctxt.string[1].data_length, 0,
664 ((number of chars in common) / (average length of the strings)).
665 This is admittedly biased towards finding that the strings are
666 similar, however it does produce meaningful results. */
667 return ((double) (ctxt.string[0].data_length + ctxt.string[1].data_length
668 - ctxt.string[1].edit_count - ctxt.string[0].edit_count)
669 / (ctxt.string[0].data_length + ctxt.string[1].data_length));