1 /* Analyze differences between two vectors.
3 Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2009 Free
4 Software Foundation, Inc.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 /* The basic idea is to consider two vectors as similar if, when
21 transforming the first vector into the second vector through a
22 sequence of edits (inserts and deletes of one element each),
23 this sequence is short - or equivalently, if the ordered list
24 of elements that are untouched by these edits is long. For a
25 good introduction to the subject, read about the "Levenshtein
26 distance" in Wikipedia.
28 The basic algorithm is described in:
29 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
30 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
31 see especially section 4.2, which describes the variation used below.
33 The basic algorithm was independently discovered as described in:
34 "Algorithms for Approximate String Matching", E. Ukkonen,
35 Information and Control Vol. 64, 1985, pp. 100-118.
37 Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
38 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
39 at the price of producing suboptimal output for large inputs with
42 /* Before including this file, you need to define:
43 ELEMENT The element type of the vectors being compared.
44 EQUAL A two-argument macro that tests two elements for
46 OFFSET A signed integer type sufficient to hold the
47 difference between two indices. Usually
48 something like ssize_t.
49 EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'.
50 NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
51 NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
52 EARLY_ABORT(ctxt) (Optional) A boolean expression that triggers an
53 early abort of the computation.
54 USE_HEURISTIC (Optional) Define if you want to support the
55 heuristic for large vectors.
56 Before including this file, you also need to include:
62 /* Maximum value of type OFFSET. */
64 ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
66 /* Default to no early abort. */
68 # define EARLY_ABORT(ctxt) false
71 /* Use this to suppress gcc's `...may be used before initialized' warnings. */
74 # define IF_LINT(Code) Code
76 # define IF_LINT(Code) /* empty */
80 /* As above, but when Code must contain one comma. */
83 # define IF_LINT2(Code1, Code2) Code1, Code2
85 # define IF_LINT2(Code1, Code2) /* empty */
90 * Context of comparison operation.
94 /* Vectors being compared. */
101 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
102 furthest along the given diagonal in the forward search of the edit
106 /* Vector, indexed by diagonal, containing the X coordinate of the point
107 furthest along the given diagonal in the backward search of the edit
112 /* This corresponds to the diff -H flag. With this heuristic, for
113 vectors with a constant small density of changes, the algorithm is
114 linear in the vectors size. */
118 /* Edit scripts longer than this are too expensive to compute. */
119 OFFSET too_expensive;
121 /* Snakes bigger than this are considered `big'. */
122 #define SNAKE_LIMIT 20
127 /* Midpoints of this partition. */
131 /* True if low half will be analyzed minimally. */
134 /* Likewise for high half. */
139 /* Find the midpoint of the shortest edit script for a specified portion
142 Scan from the beginnings of the vectors, and simultaneously from the ends,
143 doing a breadth-first search through the space of edit-sequence.
144 When the two searches meet, we have found the midpoint of the shortest
147 If FIND_MINIMAL is true, find the minimal edit script regardless of
148 expense. Otherwise, if the search is too expensive, use heuristics to
149 stop the search and report a suboptimal answer.
151 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
152 XMID - YMID equals the number of inserted elements minus the number
153 of deleted elements (counting only elements before the midpoint).
155 Set PART->lo_minimal to true iff the minimal edit script for the
156 left half of the partition is known; similarly for PART->hi_minimal.
158 This function assumes that the first elements of the specified portions
159 of the two vectors do not match, and likewise that the last elements do not
160 match. The caller must trim matching elements from the beginning and end
161 of the portions it is going to specify.
163 If we return the "wrong" partitions, the worst this can do is cause
164 suboptimal diff output. It cannot cause incorrect diff output. */
167 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
168 struct partition *part, struct context *ctxt)
170 OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */
171 OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */
172 ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
173 ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
174 const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */
175 const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */
176 const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */
177 const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
179 OFFSET fmax = fmid; /* Limits of top-down search. */
181 OFFSET bmax = bmid; /* Limits of bottom-up search. */
182 OFFSET c; /* Cost. */
183 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
184 diagonal with respect to the northwest. */
191 OFFSET d; /* Active diagonal. */
192 bool big_snake = false;
194 /* Extend the top-down search by an edit step in each diagonal. */
203 for (d = fmax; d >= fmin; d -= 2)
207 OFFSET tlo = fd[d - 1];
208 OFFSET thi = fd[d + 1];
209 OFFSET x0 = tlo < thi ? thi : tlo + 1;
211 for (x = x0, y = x0 - d;
212 x < xlim && y < ylim && EQUAL (xv[x], yv[y]);
215 if (x - x0 > SNAKE_LIMIT)
218 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
222 part->lo_minimal = part->hi_minimal = true;
227 /* Similarly extend the bottom-up search. */
229 bd[--bmin - 1] = OFFSET_MAX;
233 bd[++bmax + 1] = OFFSET_MAX;
236 for (d = bmax; d >= bmin; d -= 2)
240 OFFSET tlo = bd[d - 1];
241 OFFSET thi = bd[d + 1];
242 OFFSET x0 = tlo < thi ? tlo : thi - 1;
244 for (x = x0, y = x0 - d;
245 xoff < x && yoff < y && EQUAL (xv[x - 1], yv[y - 1]);
248 if (x0 - x > SNAKE_LIMIT)
251 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
255 part->lo_minimal = part->hi_minimal = true;
264 /* Heuristic: check occasionally for a diagonal that has made lots
265 of progress compared with the edit distance. If we have any
266 such, find the one that has made the most progress and return it
267 as if it had succeeded.
269 With this heuristic, for vectors with a constant small density
270 of changes, the algorithm is linear in the vector size. */
272 if (200 < c && big_snake && ctxt->heuristic)
277 for (d = fmax; d >= fmin; d -= 2)
279 OFFSET dd = d - fmid;
282 OFFSET v = (x - xoff) * 2 - dd;
284 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
287 && xoff + SNAKE_LIMIT <= x && x < xlim
288 && yoff + SNAKE_LIMIT <= y && y < ylim)
290 /* We have a good enough best diagonal; now insist
291 that it end with a significant snake. */
294 for (k = 1; EQUAL (xv[x - k], yv[y - k]); k++)
295 if (k == SNAKE_LIMIT)
307 part->lo_minimal = true;
308 part->hi_minimal = false;
316 for (d = bmax; d >= bmin; d -= 2)
318 OFFSET dd = d - bmid;
321 OFFSET v = (xlim - x) * 2 + dd;
323 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
326 && xoff < x && x <= xlim - SNAKE_LIMIT
327 && yoff < y && y <= ylim - SNAKE_LIMIT)
329 /* We have a good enough best diagonal; now insist
330 that it end with a significant snake. */
333 for (k = 0; EQUAL (xv[x + k], yv[y + k]); k++)
334 if (k == SNAKE_LIMIT - 1)
346 part->lo_minimal = false;
347 part->hi_minimal = true;
352 #endif /* USE_HEURISTIC */
354 /* Heuristic: if we've gone well beyond the call of duty, give up
355 and report halfway between our best results so far. */
356 if (c >= ctxt->too_expensive)
359 OFFSET fxbest IF_LINT (= 0);
361 OFFSET bxbest IF_LINT (= 0);
363 /* Find forward diagonal that maximizes X + Y. */
365 for (d = fmax; d >= fmin; d -= 2)
367 OFFSET x = MIN (fd[d], xlim);
381 /* Find backward diagonal that minimizes X + Y. */
382 bxybest = OFFSET_MAX;
383 for (d = bmax; d >= bmin; d -= 2)
385 OFFSET x = MAX (xoff, bd[d]);
399 /* Use the better of the two diagonals. */
400 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
403 part->ymid = fxybest - fxbest;
404 part->lo_minimal = true;
405 part->hi_minimal = false;
410 part->ymid = bxybest - bxbest;
411 part->lo_minimal = false;
412 part->hi_minimal = true;
420 /* Compare in detail contiguous subsequences of the two vectors
421 which are known, as a whole, to match each other.
423 The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
425 Note that XLIM, YLIM are exclusive bounds. All indices into the vectors
428 If FIND_MINIMAL, find a minimal difference no matter how
431 The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
433 Return false if terminated normally, or true if terminated through early
437 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
438 bool find_minimal, struct context *ctxt)
440 ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */
441 ELEMENT const *yv = ctxt->yvec;
443 /* Slide down the bottom initial diagonal. */
444 while (xoff < xlim && yoff < ylim && EQUAL (xv[xoff], yv[yoff]))
450 /* Slide up the top initial diagonal. */
451 while (xoff < xlim && yoff < ylim && EQUAL (xv[xlim - 1], yv[ylim - 1]))
457 /* Handle simple cases. */
461 NOTE_INSERT (ctxt, yoff);
462 if (EARLY_ABORT (ctxt))
466 else if (yoff == ylim)
469 NOTE_DELETE (ctxt, xoff);
470 if (EARLY_ABORT (ctxt))
476 struct partition part IF_LINT2 (= { .xmid = 0, .ymid = 0 });
478 /* Find a point of correspondence in the middle of the vectors. */
479 diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
481 /* Use the partitions to split this problem into subproblems. */
482 if (compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt))
484 if (compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt))
494 #undef EXTRA_CONTEXT_FIELDS