X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Ffstrcmp.c;h=f0e51e7276d1ccf8cfa3d8a78323fe3b1ff27944;hb=4565266f776db0d78b2c207a5fe79f929f9806ca;hp=9bd742996863fc783a74e2b8794191720f2ea8dd;hpb=7fef1769f5bc137624077568f1a5016dbcee2fda;p=gnulib.git diff --git a/lib/fstrcmp.c b/lib/fstrcmp.c index 9bd742996..f0e51e727 100644 --- a/lib/fstrcmp.c +++ b/lib/fstrcmp.c @@ -1,28 +1,27 @@ /* Functions to make fuzzy comparisons between strings Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006 Free Software Foundation, Inc. - This program is free software; you can redistribute it and/or modify + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or (at - your option) any later version. + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + along with this program. If not, see . Derived from GNU diff 2.7, analyze.c et al. - The basic idea is to consider two strings as similar if, when - transforming the first string into the second string through a - sequence of edits (inserts and deletes of one character each), + The basic idea is to consider two vectors as similar if, when + transforming the first vector into the second vector through a + sequence of edits (inserts and deletes of one element each), this sequence is short - or equivalently, if the ordered list - of characters that are untouched by these edits is long. For a + of elements that are untouched by these edits is long. For a good introduction to the subject, read about the "Levenshtein distance" in Wikipedia. @@ -35,13 +34,10 @@ "Algorithms for Approximate String Matching", E. Ukkonen, Information and Control Vol. 64, 1985, pp. 100-118. - Unless the 'minimal' flag is set, this code uses the TOO_EXPENSIVE + Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) at the price of producing suboptimal output for large inputs with - many differences. - - Modified to work on strings rather than files - by Peter Miller , October 1995 */ + many differences. */ #include @@ -49,12 +45,14 @@ #include "fstrcmp.h" #include +#include #include #include #include #include "lock.h" #include "tls.h" +#include "minmax.h" #include "xalloc.h" #ifndef uintptr_t @@ -62,503 +60,18 @@ #endif -/* - * Context of comparison operation. - */ -struct context -{ - /* - * Data on one input string being compared. - */ - struct string_data - { - /* The string to be compared. */ - const char *data; - - /* The length of the string to be compared. */ - int data_length; - - /* The number of characters inserted or deleted. */ - int edit_count; - } - string[2]; - - #ifdef MINUS_H_FLAG - - /* This corresponds to the diff -H flag. With this heuristic, for - strings with a constant small density of changes, the algorithm is - linear in the strings size. This is unlikely in typical uses of - fstrcmp, and so is usually compiled out. Besides, there is no - interface to set it true. */ - int heuristic; - - #endif - - /* Vector, indexed by diagonal, containing 1 + the X coordinate of the - point furthest along the given diagonal in the forward search of the - edit matrix. */ - int *fdiag; - - /* Vector, indexed by diagonal, containing the X coordinate of the point - furthest along the given diagonal in the backward search of the edit - matrix. */ - int *bdiag; - - /* Edit scripts longer than this are too expensive to compute. */ - int too_expensive; - - /* Snakes bigger than this are considered `big'. */ - #define SNAKE_LIMIT 20 -}; - -struct partition -{ - /* Midpoints of this partition. */ - int xmid, ymid; - - /* Nonzero if low half will be analyzed minimally. */ - int lo_minimal; - - /* Likewise for high half. */ - int hi_minimal; -}; - - -/* NAME - diag - find diagonal path - - SYNOPSIS - int diag(int xoff, int xlim, int yoff, int ylim, int minimal, - struct partition *part, struct context *ctxt); - - DESCRIPTION - Find the midpoint of the shortest edit script for a specified - portion of the two strings. - - Scan from the beginnings of the strings, and simultaneously from - the ends, doing a breadth-first search through the space of - edit-sequence. When the two searches meet, we have found the - midpoint of the shortest edit sequence. - - If MINIMAL is nonzero, find the minimal edit script regardless - of expense. Otherwise, if the search is too expensive, use - heuristics to stop the search and report a suboptimal answer. - - RETURNS - Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal - number XMID - YMID equals the number of inserted characters - minus the number of deleted characters (counting only characters - before the midpoint). Return the approximate edit cost; this is - the total number of characters inserted or deleted (counting - only characters before the midpoint), unless a heuristic is used - to terminate the search prematurely. - - Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script - for the left half of the partition is known; similarly for - PART->RIGHT_MINIMAL. - - CAVEAT - This function assumes that the first characters of the specified - portions of the two strings do not match, and likewise that the - last characters do not match. The caller must trim matching - characters from the beginning and end of the portions it is - going to specify. - - If we return the "wrong" partitions, the worst this can do is - cause suboptimal diff output. It cannot cause incorrect diff - output. */ - -static int -diag (int xoff, int xlim, int yoff, int ylim, int minimal, - struct partition *part, struct context *ctxt) -{ - int *const fd = ctxt->fdiag; /* Give the compiler a chance. */ - int *const bd = ctxt->bdiag; /* Additional help for the compiler. */ - const char *const xv = ctxt->string[0].data; /* Still more help for the compiler. */ - const char *const yv = ctxt->string[1].data; /* And more and more . . . */ - const int dmin = xoff - ylim; /* Minimum valid diagonal. */ - const int dmax = xlim - yoff; /* Maximum valid diagonal. */ - const int fmid = xoff - yoff; /* Center diagonal of top-down search. */ - const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ - int fmin = fmid; - int fmax = fmid; /* Limits of top-down search. */ - int bmin = bmid; - int bmax = bmid; /* Limits of bottom-up search. */ - int c; /* Cost. */ - int odd = (fmid - bmid) & 1; - - /* - * True if southeast corner is on an odd diagonal with respect - * to the northwest. - */ - fd[fmid] = xoff; - bd[bmid] = xlim; - for (c = 1;; ++c) - { - int d; /* Active diagonal. */ - int big_snake; - - big_snake = 0; - /* Extend the top-down search by an edit step in each diagonal. */ - if (fmin > dmin) - fd[--fmin - 1] = -1; - else - ++fmin; - if (fmax < dmax) - fd[++fmax + 1] = -1; - else - --fmax; - for (d = fmax; d >= fmin; d -= 2) - { - int x; - int y; - int oldx; - int tlo; - int thi; - - tlo = fd[d - 1], - thi = fd[d + 1]; - - if (tlo >= thi) - x = tlo + 1; - else - x = thi; - oldx = x; - y = x - d; - while (x < xlim && y < ylim && xv[x] == yv[y]) - { - ++x; - ++y; - } - if (x - oldx > SNAKE_LIMIT) - big_snake = 1; - fd[d] = x; - if (odd && bmin <= d && d <= bmax && bd[d] <= x) - { - part->xmid = x; - part->ymid = y; - part->lo_minimal = part->hi_minimal = 1; - return 2 * c - 1; - } - } - /* Similarly extend the bottom-up search. */ - if (bmin > dmin) - bd[--bmin - 1] = INT_MAX; - else - ++bmin; - if (bmax < dmax) - bd[++bmax + 1] = INT_MAX; - else - --bmax; - for (d = bmax; d >= bmin; d -= 2) - { - int x; - int y; - int oldx; - int tlo; - int thi; - - tlo = bd[d - 1], - thi = bd[d + 1]; - if (tlo < thi) - x = tlo; - else - x = thi - 1; - oldx = x; - y = x - d; - while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) - { - --x; - --y; - } - if (oldx - x > SNAKE_LIMIT) - big_snake = 1; - bd[d] = x; - if (!odd && fmin <= d && d <= fmax && x <= fd[d]) - { - part->xmid = x; - part->ymid = y; - part->lo_minimal = part->hi_minimal = 1; - return 2 * c; - } - } - - if (minimal) - continue; - -#ifdef MINUS_H_FLAG - /* Heuristic: check occasionally for a diagonal that has made lots - of progress compared with the edit distance. If we have any - such, find the one that has made the most progress and return - it as if it had succeeded. - - With this heuristic, for strings with a constant small density - of changes, the algorithm is linear in the strings size. */ - if (c > 200 && big_snake && ctxt->heuristic) - { - int best; - - best = 0; - for (d = fmax; d >= fmin; d -= 2) - { - int dd; - int x; - int y; - int v; - - dd = d - fmid; - x = fd[d]; - y = x - d; - v = (x - xoff) * 2 - dd; - - if (v > 12 * (c + (dd < 0 ? -dd : dd))) - { - if - ( - v > best - && - xoff + SNAKE_LIMIT <= x - && - x < xlim - && - yoff + SNAKE_LIMIT <= y - && - y < ylim - ) - { - /* We have a good enough best diagonal; now insist - that it end with a significant snake. */ - int k; - - for (k = 1; xv[x - k] == yv[y - k]; k++) - { - if (k == SNAKE_LIMIT) - { - best = v; - part->xmid = x; - part->ymid = y; - break; - } - } - } - } - } - if (best > 0) - { - part->lo_minimal = 1; - part->hi_minimal = 0; - return 2 * c - 1; - } - best = 0; - for (d = bmax; d >= bmin; d -= 2) - { - int dd; - int x; - int y; - int v; - - dd = d - bmid; - x = bd[d]; - y = x - d; - v = (xlim - x) * 2 + dd; - - if (v > 12 * (c + (dd < 0 ? -dd : dd))) - { - if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT && - yoff < y && y <= ylim - SNAKE_LIMIT) - { - /* We have a good enough best diagonal; now insist - that it end with a significant snake. */ - int k; - - for (k = 0; xv[x + k] == yv[y + k]; k++) - { - if (k == SNAKE_LIMIT - 1) - { - best = v; - part->xmid = x; - part->ymid = y; - break; - } - } - } - } - } - if (best > 0) - { - part->lo_minimal = 0; - part->hi_minimal = 1; - return 2 * c - 1; - } - } -#endif /* MINUS_H_FLAG */ - - /* Heuristic: if we've gone well beyond the call of duty, give up - and report halfway between our best results so far. */ - if (c >= ctxt->too_expensive) - { - int fxybest; - int fxbest; - int bxybest; - int bxbest; - - /* Pacify `gcc -Wall'. */ - fxbest = 0; - bxbest = 0; - - /* Find forward diagonal that maximizes X + Y. */ - fxybest = -1; - for (d = fmax; d >= fmin; d -= 2) - { - int x; - int y; - - x = fd[d] < xlim ? fd[d] : xlim; - y = x - d; - - if (ylim < y) - { - x = ylim + d; - y = ylim; - } - if (fxybest < x + y) - { - fxybest = x + y; - fxbest = x; - } - } - /* Find backward diagonal that minimizes X + Y. */ - bxybest = INT_MAX; - for (d = bmax; d >= bmin; d -= 2) - { - int x; - int y; - - x = xoff > bd[d] ? xoff : bd[d]; - y = x - d; - - if (y < yoff) - { - x = yoff + d; - y = yoff; - } - if (x + y < bxybest) - { - bxybest = x + y; - bxbest = x; - } - } - /* Use the better of the two diagonals. */ - if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) - { - part->xmid = fxbest; - part->ymid = fxybest - fxbest; - part->lo_minimal = 1; - part->hi_minimal = 0; - } - else - { - part->xmid = bxbest; - part->ymid = bxybest - bxbest; - part->lo_minimal = 0; - part->hi_minimal = 1; - } - return 2 * c - 1; - } - } -} - - -/* NAME - compareseq - find edit sequence - - SYNOPSIS - void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal, - struct context *ctxt); - - DESCRIPTION - Compare in detail contiguous subsequences of the two strings - which are known, as a whole, to match each other. - - The subsequence of string 0 is [XOFF, XLIM) and likewise for - string 1. - - Note that XLIM, YLIM are exclusive bounds. All character - numbers are origin-0. - - If MINIMAL is nonzero, find a minimal difference no matter how - expensive it is. */ - -static void -compareseq (int xoff, int xlim, int yoff, int ylim, int minimal, - struct context *ctxt) -{ - const char *const xv = ctxt->string[0].data; /* Help the compiler. */ - const char *const yv = ctxt->string[1].data; - - /* Slide down the bottom initial diagonal. */ - while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) - { - ++xoff; - ++yoff; - } - - /* Slide up the top initial diagonal. */ - while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) - { - --xlim; - --ylim; - } - - /* Handle simple cases. */ - if (xoff == xlim) - { - while (yoff < ylim) - { - ctxt->string[1].edit_count++; - ++yoff; - } - } - else if (yoff == ylim) - { - while (xoff < xlim) - { - ctxt->string[0].edit_count++; - ++xoff; - } - } - else - { - int c; - struct partition part; - - /* Find a point of correspondence in the middle of the strings. */ - c = diag (xoff, xlim, yoff, ylim, minimal, &part, ctxt); - if (c == 1) - { -#if 0 - /* This should be impossible, because it implies that one of - the two subsequences is empty, and that case was handled - above without calling `diag'. Let's verify that this is - true. */ - abort (); -#else - /* The two subsequences differ by a single insert or delete; - record it and we are done. */ - if (part.xmid - part.ymid < xoff - yoff) - ctxt->string[1].edit_count++; - else - ctxt->string[0].edit_count++; -#endif - } - else - { - /* Use the partitions to split this problem into subproblems. */ - compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt); - compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt); - } - } -} +#define ELEMENT char +#define EQUAL(x,y) ((x) == (y)) +#define OFFSET int +#define EXTRA_CONTEXT_FIELDS \ + /* The number of elements inserted or deleted. */ \ + int xvec_edit_count; \ + int yvec_edit_count; +#define NOTE_DELETE(ctxt, xoff) ctxt->xvec_edit_count++ +#define NOTE_INSERT(ctxt, yoff) ctxt->yvec_edit_count++ +/* We don't need USE_HEURISTIC, since it is unlikely in typical uses of + fstrcmp(). */ +#include "diffseq.h" /* Because fstrcmp is typically called multiple times, attempt to minimize @@ -580,7 +93,7 @@ keys_init (void) } /* Ensure that keys_init is called once only. */ -gl_once_define(static, keys_init_once); +gl_once_define(static, keys_init_once) /* NAME @@ -604,6 +117,8 @@ double fstrcmp (const char *string1, const char *string2) { struct context ctxt; + int xvec_length; + int yvec_length; int i; size_t fdiag_len; @@ -611,21 +126,21 @@ fstrcmp (const char *string1, const char *string2) size_t bufmax; /* set the info for each string. */ - ctxt.string[0].data = string1; - ctxt.string[0].data_length = strlen (string1); - ctxt.string[1].data = string2; - ctxt.string[1].data_length = strlen (string2); + ctxt.xvec = string1; + xvec_length = strlen (string1); + ctxt.yvec = string2; + yvec_length = strlen (string2); /* short-circuit obvious comparisons */ - if (ctxt.string[0].data_length == 0 && ctxt.string[1].data_length == 0) + if (xvec_length == 0 && yvec_length == 0) return 1.0; - if (ctxt.string[0].data_length == 0 || ctxt.string[1].data_length == 0) + if (xvec_length == 0 || yvec_length == 0) return 0.0; /* Set TOO_EXPENSIVE to be approximate square root of input size, bounded below by 256. */ ctxt.too_expensive = 1; - for (i = ctxt.string[0].data_length + ctxt.string[1].data_length; + for (i = xvec_length + yvec_length; i != 0; i >>= 2) ctxt.too_expensive <<= 1; @@ -633,7 +148,7 @@ fstrcmp (const char *string1, const char *string2) ctxt.too_expensive = 256; /* Allocate memory for fdiag and bdiag from a thread-local pool. */ - fdiag_len = ctxt.string[0].data_length + ctxt.string[1].data_length + 3; + fdiag_len = xvec_length + yvec_length + 3; gl_once (keys_init_once, keys_init); buffer = (int *) gl_tls_get (buffer_key); bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key); @@ -647,24 +162,24 @@ fstrcmp (const char *string1, const char *string2) to be preserved. */ if (buffer != NULL) free (buffer); - buffer = (int *) xmalloc (bufmax * (2 * sizeof (int))); + buffer = (int *) xnmalloc (bufmax, 2 * sizeof (int)); gl_tls_set (buffer_key, buffer); gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax); } - ctxt.fdiag = buffer + ctxt.string[1].data_length + 1; + ctxt.fdiag = buffer + yvec_length + 1; ctxt.bdiag = ctxt.fdiag + fdiag_len; /* Now do the main comparison algorithm */ - ctxt.string[0].edit_count = 0; - ctxt.string[1].edit_count = 0; - compareseq (0, ctxt.string[0].data_length, 0, ctxt.string[1].data_length, 0, + ctxt.xvec_edit_count = 0; + ctxt.yvec_edit_count = 0; + compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt); /* The result is ((number of chars in common) / (average length of the strings)). This is admittedly biased towards finding that the strings are similar, however it does produce meaningful results. */ - return ((double) (ctxt.string[0].data_length + ctxt.string[1].data_length - - ctxt.string[1].edit_count - ctxt.string[0].edit_count) - / (ctxt.string[0].data_length + ctxt.string[1].data_length)); + return ((double) (xvec_length + yvec_length + - ctxt.yvec_edit_count - ctxt.xvec_edit_count) + / (xvec_length + yvec_length)); }