1 /* Analyze differences between two vectors.
3 Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2008 Free
4 Software Foundation, Inc.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 /* The basic idea is to consider two vectors as similar if, when
21 transforming the first vector into the second vector through a
22 sequence of edits (inserts and deletes of one element each),
23 this sequence is short - or equivalently, if the ordered list
24 of elements that are untouched by these edits is long. For a
25 good introduction to the subject, read about the "Levenshtein
26 distance" in Wikipedia.
28 The basic algorithm is described in:
29 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
30 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
31 see especially section 4.2, which describes the variation used below.
33 The basic algorithm was independently discovered as described in:
34 "Algorithms for Approximate String Matching", E. Ukkonen,
35 Information and Control Vol. 64, 1985, pp. 100-118.
37 Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
38 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
39 at the price of producing suboptimal output for large inputs with
42 /* Before including this file, you need to define:
43 ELEMENT The element type of the vectors being compared.
44 EQUAL A two-argument macro that tests two elements for
46 OFFSET A signed integer type sufficient to hold the
47 difference between two indices. Usually
48 something like ssize_t.
49 EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'.
50 NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
51 NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
52 EARLY_ABORT(ctxt) (Optional) A boolean expression that triggers an
53 early abort of the computation.
54 USE_HEURISTIC (Optional) Define if you want to support the
55 heuristic for large vectors.
56 Before including this file, you also need to include:
62 /* Maximum value of type OFFSET. */
64 ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
66 /* Default to no early abort. */
68 # define EARLY_ABORT(ctxt) false
71 /* Use this to suppress gcc's `...may be used before initialized' warnings. */
74 # define IF_LINT(Code) Code
76 # define IF_LINT(Code) /* empty */
81 * Context of comparison operation.
85 /* Vectors being compared. */
92 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
93 furthest along the given diagonal in the forward search of the edit
97 /* Vector, indexed by diagonal, containing the X coordinate of the point
98 furthest along the given diagonal in the backward search of the edit
103 /* This corresponds to the diff -H flag. With this heuristic, for
104 vectors with a constant small density of changes, the algorithm is
105 linear in the vectors size. */
109 /* Edit scripts longer than this are too expensive to compute. */
110 OFFSET too_expensive;
112 /* Snakes bigger than this are considered `big'. */
113 #define SNAKE_LIMIT 20
118 /* Midpoints of this partition. */
122 /* True if low half will be analyzed minimally. */
125 /* Likewise for high half. */
130 /* Find the midpoint of the shortest edit script for a specified portion
133 Scan from the beginnings of the vectors, and simultaneously from the ends,
134 doing a breadth-first search through the space of edit-sequence.
135 When the two searches meet, we have found the midpoint of the shortest
138 If FIND_MINIMAL is true, find the minimal edit script regardless of
139 expense. Otherwise, if the search is too expensive, use heuristics to
140 stop the search and report a suboptimal answer.
142 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
143 XMID - YMID equals the number of inserted elements minus the number
144 of deleted elements (counting only elements before the midpoint).
146 Set PART->lo_minimal to true iff the minimal edit script for the
147 left half of the partition is known; similarly for PART->hi_minimal.
149 This function assumes that the first elements of the specified portions
150 of the two vectors do not match, and likewise that the last elements do not
151 match. The caller must trim matching elements from the beginning and end
152 of the portions it is going to specify.
154 If we return the "wrong" partitions, the worst this can do is cause
155 suboptimal diff output. It cannot cause incorrect diff output. */
158 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
159 struct partition *part, struct context *ctxt)
161 OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */
162 OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */
163 ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
164 ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
165 const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */
166 const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */
167 const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */
168 const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
170 OFFSET fmax = fmid; /* Limits of top-down search. */
172 OFFSET bmax = bmid; /* Limits of bottom-up search. */
173 OFFSET c; /* Cost. */
174 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
175 diagonal with respect to the northwest. */
182 OFFSET d; /* Active diagonal. */
183 bool big_snake = false;
185 /* Extend the top-down search by an edit step in each diagonal. */
194 for (d = fmax; d >= fmin; d -= 2)
198 OFFSET tlo = fd[d - 1];
199 OFFSET thi = fd[d + 1];
200 OFFSET x0 = tlo < thi ? thi : tlo + 1;
202 for (x = x0, y = x0 - d;
203 x < xlim && y < ylim && EQUAL (xv[x], yv[y]);
206 if (x - x0 > SNAKE_LIMIT)
209 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
213 part->lo_minimal = part->hi_minimal = true;
218 /* Similarly extend the bottom-up search. */
220 bd[--bmin - 1] = OFFSET_MAX;
224 bd[++bmax + 1] = OFFSET_MAX;
227 for (d = bmax; d >= bmin; d -= 2)
231 OFFSET tlo = bd[d - 1];
232 OFFSET thi = bd[d + 1];
233 OFFSET x0 = tlo < thi ? tlo : thi - 1;
235 for (x = x0, y = x0 - d;
236 xoff < x && yoff < y && EQUAL (xv[x - 1], yv[y - 1]);
239 if (x0 - x > SNAKE_LIMIT)
242 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
246 part->lo_minimal = part->hi_minimal = true;
255 /* Heuristic: check occasionally for a diagonal that has made lots
256 of progress compared with the edit distance. If we have any
257 such, find the one that has made the most progress and return it
258 as if it had succeeded.
260 With this heuristic, for vectors with a constant small density
261 of changes, the algorithm is linear in the vector size. */
263 if (200 < c && big_snake && ctxt->heuristic)
267 for (d = fmax; d >= fmin; d -= 2)
269 OFFSET dd = d - fmid;
272 OFFSET v = (x - xoff) * 2 - dd;
274 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
277 && xoff + SNAKE_LIMIT <= x && x < xlim
278 && yoff + SNAKE_LIMIT <= y && y < ylim)
280 /* We have a good enough best diagonal; now insist
281 that it end with a significant snake. */
284 for (k = 1; EQUAL (xv[x - k], yv[y - k]); k++)
285 if (k == SNAKE_LIMIT)
297 part->lo_minimal = true;
298 part->hi_minimal = false;
303 for (d = bmax; d >= bmin; d -= 2)
305 OFFSET dd = d - bmid;
308 OFFSET v = (xlim - x) * 2 + dd;
310 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
313 && xoff < x && x <= xlim - SNAKE_LIMIT
314 && yoff < y && y <= ylim - SNAKE_LIMIT)
316 /* We have a good enough best diagonal; now insist
317 that it end with a significant snake. */
320 for (k = 0; EQUAL (xv[x + k], yv[y + k]); k++)
321 if (k == SNAKE_LIMIT - 1)
333 part->lo_minimal = false;
334 part->hi_minimal = true;
338 #endif /* USE_HEURISTIC */
340 /* Heuristic: if we've gone well beyond the call of duty, give up
341 and report halfway between our best results so far. */
342 if (c >= ctxt->too_expensive)
345 OFFSET fxbest IF_LINT (= 0);
347 OFFSET bxbest IF_LINT (= 0);
349 /* Find forward diagonal that maximizes X + Y. */
351 for (d = fmax; d >= fmin; d -= 2)
353 OFFSET x = MIN (fd[d], xlim);
367 /* Find backward diagonal that minimizes X + Y. */
368 bxybest = OFFSET_MAX;
369 for (d = bmax; d >= bmin; d -= 2)
371 OFFSET x = MAX (xoff, bd[d]);
385 /* Use the better of the two diagonals. */
386 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
389 part->ymid = fxybest - fxbest;
390 part->lo_minimal = true;
391 part->hi_minimal = false;
396 part->ymid = bxybest - bxbest;
397 part->lo_minimal = false;
398 part->hi_minimal = true;
406 /* Compare in detail contiguous subsequences of the two vectors
407 which are known, as a whole, to match each other.
409 The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
411 Note that XLIM, YLIM are exclusive bounds. All indices into the vectors
414 If FIND_MINIMAL, find a minimal difference no matter how
417 The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
419 Return false if terminated normally, or true if terminated through early
423 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
424 bool find_minimal, struct context *ctxt)
426 ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */
427 ELEMENT const *yv = ctxt->yvec;
429 /* Slide down the bottom initial diagonal. */
430 while (xoff < xlim && yoff < ylim && EQUAL (xv[xoff], yv[yoff]))
436 /* Slide up the top initial diagonal. */
437 while (xoff < xlim && yoff < ylim && EQUAL (xv[xlim - 1], yv[ylim - 1]))
443 /* Handle simple cases. */
447 NOTE_INSERT (ctxt, yoff);
448 if (EARLY_ABORT (ctxt))
452 else if (yoff == ylim)
455 NOTE_DELETE (ctxt, xoff);
456 if (EARLY_ABORT (ctxt))
462 struct partition part;
464 /* Find a point of correspondence in the middle of the vectors. */
465 diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
467 /* Use the partitions to split this problem into subproblems. */
468 if (compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt))
470 if (compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt))
480 #undef EXTRA_CONTEXT_FIELDS