X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Ffstrcmp.c;h=595c1f3744b29431cf529ddf712230741b996915;hb=fac94349f25305566277d15b965fa8f4efe66613;hp=9bd742996863fc783a74e2b8794191720f2ea8dd;hpb=7fef1769f5bc137624077568f1a5016dbcee2fda;p=gnulib.git diff --git a/lib/fstrcmp.c b/lib/fstrcmp.c index 9bd742996..595c1f374 100644 --- a/lib/fstrcmp.c +++ b/lib/fstrcmp.c @@ -1,28 +1,28 @@ /* Functions to make fuzzy comparisons between strings - Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006 Free Software Foundation, Inc. + Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006, 2008-2011 Free + Software Foundation, Inc. - This program is free software; you can redistribute it and/or modify + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or (at - your option) any later version. + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + along with this program. If not, see . Derived from GNU diff 2.7, analyze.c et al. - The basic idea is to consider two strings as similar if, when - transforming the first string into the second string through a - sequence of edits (inserts and deletes of one character each), + The basic idea is to consider two vectors as similar if, when + transforming the first vector into the second vector through a + sequence of edits (inserts and deletes of one element each), this sequence is short - or equivalently, if the ordered list - of characters that are untouched by these edits is long. For a + of elements that are untouched by these edits is long. For a good introduction to the subject, read about the "Levenshtein distance" in Wikipedia. @@ -35,13 +35,10 @@ "Algorithms for Approximate String Matching", E. Ukkonen, Information and Control Vol. 64, 1985, pp. 100-118. - Unless the 'minimal' flag is set, this code uses the TOO_EXPENSIVE + Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) at the price of producing suboptimal output for large inputs with - many differences. - - Modified to work on strings rather than files - by Peter Miller , October 1995 */ + many differences. */ #include @@ -49,12 +46,14 @@ #include "fstrcmp.h" #include +#include #include #include #include -#include "lock.h" -#include "tls.h" +#include "glthread/lock.h" +#include "glthread/tls.h" +#include "minmax.h" #include "xalloc.h" #ifndef uintptr_t @@ -62,503 +61,21 @@ #endif -/* - * Context of comparison operation. - */ -struct context -{ - /* - * Data on one input string being compared. - */ - struct string_data - { - /* The string to be compared. */ - const char *data; - - /* The length of the string to be compared. */ - int data_length; - - /* The number of characters inserted or deleted. */ - int edit_count; - } - string[2]; - - #ifdef MINUS_H_FLAG - - /* This corresponds to the diff -H flag. With this heuristic, for - strings with a constant small density of changes, the algorithm is - linear in the strings size. This is unlikely in typical uses of - fstrcmp, and so is usually compiled out. Besides, there is no - interface to set it true. */ - int heuristic; - - #endif - - /* Vector, indexed by diagonal, containing 1 + the X coordinate of the - point furthest along the given diagonal in the forward search of the - edit matrix. */ - int *fdiag; - - /* Vector, indexed by diagonal, containing the X coordinate of the point - furthest along the given diagonal in the backward search of the edit - matrix. */ - int *bdiag; - - /* Edit scripts longer than this are too expensive to compute. */ - int too_expensive; - - /* Snakes bigger than this are considered `big'. */ - #define SNAKE_LIMIT 20 -}; - -struct partition -{ - /* Midpoints of this partition. */ - int xmid, ymid; - - /* Nonzero if low half will be analyzed minimally. */ - int lo_minimal; - - /* Likewise for high half. */ - int hi_minimal; -}; - - -/* NAME - diag - find diagonal path - - SYNOPSIS - int diag(int xoff, int xlim, int yoff, int ylim, int minimal, - struct partition *part, struct context *ctxt); - - DESCRIPTION - Find the midpoint of the shortest edit script for a specified - portion of the two strings. - - Scan from the beginnings of the strings, and simultaneously from - the ends, doing a breadth-first search through the space of - edit-sequence. When the two searches meet, we have found the - midpoint of the shortest edit sequence. - - If MINIMAL is nonzero, find the minimal edit script regardless - of expense. Otherwise, if the search is too expensive, use - heuristics to stop the search and report a suboptimal answer. - - RETURNS - Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal - number XMID - YMID equals the number of inserted characters - minus the number of deleted characters (counting only characters - before the midpoint). Return the approximate edit cost; this is - the total number of characters inserted or deleted (counting - only characters before the midpoint), unless a heuristic is used - to terminate the search prematurely. - - Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script - for the left half of the partition is known; similarly for - PART->RIGHT_MINIMAL. - - CAVEAT - This function assumes that the first characters of the specified - portions of the two strings do not match, and likewise that the - last characters do not match. The caller must trim matching - characters from the beginning and end of the portions it is - going to specify. - - If we return the "wrong" partitions, the worst this can do is - cause suboptimal diff output. It cannot cause incorrect diff - output. */ - -static int -diag (int xoff, int xlim, int yoff, int ylim, int minimal, - struct partition *part, struct context *ctxt) -{ - int *const fd = ctxt->fdiag; /* Give the compiler a chance. */ - int *const bd = ctxt->bdiag; /* Additional help for the compiler. */ - const char *const xv = ctxt->string[0].data; /* Still more help for the compiler. */ - const char *const yv = ctxt->string[1].data; /* And more and more . . . */ - const int dmin = xoff - ylim; /* Minimum valid diagonal. */ - const int dmax = xlim - yoff; /* Maximum valid diagonal. */ - const int fmid = xoff - yoff; /* Center diagonal of top-down search. */ - const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ - int fmin = fmid; - int fmax = fmid; /* Limits of top-down search. */ - int bmin = bmid; - int bmax = bmid; /* Limits of bottom-up search. */ - int c; /* Cost. */ - int odd = (fmid - bmid) & 1; - - /* - * True if southeast corner is on an odd diagonal with respect - * to the northwest. - */ - fd[fmid] = xoff; - bd[bmid] = xlim; - for (c = 1;; ++c) - { - int d; /* Active diagonal. */ - int big_snake; - - big_snake = 0; - /* Extend the top-down search by an edit step in each diagonal. */ - if (fmin > dmin) - fd[--fmin - 1] = -1; - else - ++fmin; - if (fmax < dmax) - fd[++fmax + 1] = -1; - else - --fmax; - for (d = fmax; d >= fmin; d -= 2) - { - int x; - int y; - int oldx; - int tlo; - int thi; - - tlo = fd[d - 1], - thi = fd[d + 1]; - - if (tlo >= thi) - x = tlo + 1; - else - x = thi; - oldx = x; - y = x - d; - while (x < xlim && y < ylim && xv[x] == yv[y]) - { - ++x; - ++y; - } - if (x - oldx > SNAKE_LIMIT) - big_snake = 1; - fd[d] = x; - if (odd && bmin <= d && d <= bmax && bd[d] <= x) - { - part->xmid = x; - part->ymid = y; - part->lo_minimal = part->hi_minimal = 1; - return 2 * c - 1; - } - } - /* Similarly extend the bottom-up search. */ - if (bmin > dmin) - bd[--bmin - 1] = INT_MAX; - else - ++bmin; - if (bmax < dmax) - bd[++bmax + 1] = INT_MAX; - else - --bmax; - for (d = bmax; d >= bmin; d -= 2) - { - int x; - int y; - int oldx; - int tlo; - int thi; - - tlo = bd[d - 1], - thi = bd[d + 1]; - if (tlo < thi) - x = tlo; - else - x = thi - 1; - oldx = x; - y = x - d; - while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) - { - --x; - --y; - } - if (oldx - x > SNAKE_LIMIT) - big_snake = 1; - bd[d] = x; - if (!odd && fmin <= d && d <= fmax && x <= fd[d]) - { - part->xmid = x; - part->ymid = y; - part->lo_minimal = part->hi_minimal = 1; - return 2 * c; - } - } - - if (minimal) - continue; - -#ifdef MINUS_H_FLAG - /* Heuristic: check occasionally for a diagonal that has made lots - of progress compared with the edit distance. If we have any - such, find the one that has made the most progress and return - it as if it had succeeded. - - With this heuristic, for strings with a constant small density - of changes, the algorithm is linear in the strings size. */ - if (c > 200 && big_snake && ctxt->heuristic) - { - int best; - - best = 0; - for (d = fmax; d >= fmin; d -= 2) - { - int dd; - int x; - int y; - int v; - - dd = d - fmid; - x = fd[d]; - y = x - d; - v = (x - xoff) * 2 - dd; - - if (v > 12 * (c + (dd < 0 ? -dd : dd))) - { - if - ( - v > best - && - xoff + SNAKE_LIMIT <= x - && - x < xlim - && - yoff + SNAKE_LIMIT <= y - && - y < ylim - ) - { - /* We have a good enough best diagonal; now insist - that it end with a significant snake. */ - int k; - - for (k = 1; xv[x - k] == yv[y - k]; k++) - { - if (k == SNAKE_LIMIT) - { - best = v; - part->xmid = x; - part->ymid = y; - break; - } - } - } - } - } - if (best > 0) - { - part->lo_minimal = 1; - part->hi_minimal = 0; - return 2 * c - 1; - } - best = 0; - for (d = bmax; d >= bmin; d -= 2) - { - int dd; - int x; - int y; - int v; - - dd = d - bmid; - x = bd[d]; - y = x - d; - v = (xlim - x) * 2 + dd; - - if (v > 12 * (c + (dd < 0 ? -dd : dd))) - { - if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT && - yoff < y && y <= ylim - SNAKE_LIMIT) - { - /* We have a good enough best diagonal; now insist - that it end with a significant snake. */ - int k; - - for (k = 0; xv[x + k] == yv[y + k]; k++) - { - if (k == SNAKE_LIMIT - 1) - { - best = v; - part->xmid = x; - part->ymid = y; - break; - } - } - } - } - } - if (best > 0) - { - part->lo_minimal = 0; - part->hi_minimal = 1; - return 2 * c - 1; - } - } -#endif /* MINUS_H_FLAG */ - - /* Heuristic: if we've gone well beyond the call of duty, give up - and report halfway between our best results so far. */ - if (c >= ctxt->too_expensive) - { - int fxybest; - int fxbest; - int bxybest; - int bxbest; - - /* Pacify `gcc -Wall'. */ - fxbest = 0; - bxbest = 0; - - /* Find forward diagonal that maximizes X + Y. */ - fxybest = -1; - for (d = fmax; d >= fmin; d -= 2) - { - int x; - int y; - - x = fd[d] < xlim ? fd[d] : xlim; - y = x - d; - - if (ylim < y) - { - x = ylim + d; - y = ylim; - } - if (fxybest < x + y) - { - fxybest = x + y; - fxbest = x; - } - } - /* Find backward diagonal that minimizes X + Y. */ - bxybest = INT_MAX; - for (d = bmax; d >= bmin; d -= 2) - { - int x; - int y; - - x = xoff > bd[d] ? xoff : bd[d]; - y = x - d; - - if (y < yoff) - { - x = yoff + d; - y = yoff; - } - if (x + y < bxybest) - { - bxybest = x + y; - bxbest = x; - } - } - /* Use the better of the two diagonals. */ - if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) - { - part->xmid = fxbest; - part->ymid = fxybest - fxbest; - part->lo_minimal = 1; - part->hi_minimal = 0; - } - else - { - part->xmid = bxbest; - part->ymid = bxybest - bxbest; - part->lo_minimal = 0; - part->hi_minimal = 1; - } - return 2 * c - 1; - } - } -} - - -/* NAME - compareseq - find edit sequence - - SYNOPSIS - void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal, - struct context *ctxt); - - DESCRIPTION - Compare in detail contiguous subsequences of the two strings - which are known, as a whole, to match each other. - - The subsequence of string 0 is [XOFF, XLIM) and likewise for - string 1. - - Note that XLIM, YLIM are exclusive bounds. All character - numbers are origin-0. - - If MINIMAL is nonzero, find a minimal difference no matter how - expensive it is. */ - -static void -compareseq (int xoff, int xlim, int yoff, int ylim, int minimal, - struct context *ctxt) -{ - const char *const xv = ctxt->string[0].data; /* Help the compiler. */ - const char *const yv = ctxt->string[1].data; - - /* Slide down the bottom initial diagonal. */ - while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) - { - ++xoff; - ++yoff; - } - - /* Slide up the top initial diagonal. */ - while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) - { - --xlim; - --ylim; - } - - /* Handle simple cases. */ - if (xoff == xlim) - { - while (yoff < ylim) - { - ctxt->string[1].edit_count++; - ++yoff; - } - } - else if (yoff == ylim) - { - while (xoff < xlim) - { - ctxt->string[0].edit_count++; - ++xoff; - } - } - else - { - int c; - struct partition part; - - /* Find a point of correspondence in the middle of the strings. */ - c = diag (xoff, xlim, yoff, ylim, minimal, &part, ctxt); - if (c == 1) - { -#if 0 - /* This should be impossible, because it implies that one of - the two subsequences is empty, and that case was handled - above without calling `diag'. Let's verify that this is - true. */ - abort (); -#else - /* The two subsequences differ by a single insert or delete; - record it and we are done. */ - if (part.xmid - part.ymid < xoff - yoff) - ctxt->string[1].edit_count++; - else - ctxt->string[0].edit_count++; -#endif - } - else - { - /* Use the partitions to split this problem into subproblems. */ - compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt); - compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt); - } - } -} +#define ELEMENT char +#define EQUAL(x,y) ((x) == (y)) +#define OFFSET int +#define EXTRA_CONTEXT_FIELDS \ + /* The number of edits beyond which the computation can be aborted. */ \ + int edit_count_limit; \ + /* The number of edits (= number of elements inserted, plus the number of \ + elements deleted), temporarily minus edit_count_limit. */ \ + int edit_count; +#define NOTE_DELETE(ctxt, xoff) ctxt->edit_count++ +#define NOTE_INSERT(ctxt, yoff) ctxt->edit_count++ +#define EARLY_ABORT(ctxt) ctxt->edit_count > 0 +/* We don't need USE_HEURISTIC, since it is unlikely in typical uses of + fstrcmp(). */ +#include "diffseq.h" /* Because fstrcmp is typically called multiple times, attempt to minimize @@ -568,8 +85,8 @@ compareseq (int xoff, int xlim, int yoff, int ylim, int minimal, already allocated memory, store the allocated memory per thread. Free it only when the thread exits. */ -static gl_tls_key_t buffer_key; /* TLS key for a 'int *' */ -static gl_tls_key_t bufmax_key; /* TLS key for a 'size_t' */ +static gl_tls_key_t buffer_key; /* TLS key for a 'int *' */ +static gl_tls_key_t bufmax_key; /* TLS key for a 'size_t' */ static void keys_init (void) @@ -580,52 +97,116 @@ keys_init (void) } /* Ensure that keys_init is called once only. */ -gl_once_define(static, keys_init_once); - - -/* NAME - fstrcmp - fuzzy string compare +gl_once_define(static, keys_init_once) - SYNOPSIS - double fstrcmp(const char *, const char *); - DESCRIPTION - The fstrcmp function may be used to compare two string for - similarity. It is very useful in reducing "cascade" or - "secondary" errors in compilers or other situations where - symbol tables occur. +/* In the code below, branch probabilities were measured by Ralf Wildenhues, + by running "msgmerge LL.po coreutils.pot" with msgmerge 0.18 for many + values of LL. The probability indicates that the condition evaluates + to true; whether that leads to a branch or a non-branch in the code, + depends on the compiler's reordering of basic blocks. */ - RETURNS - double; 0 if the strings are entirly dissimilar, 1 if the - strings are identical, and a number in between if they are - similar. */ double -fstrcmp (const char *string1, const char *string2) +fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) { struct context ctxt; + int xvec_length = strlen (string1); + int yvec_length = strlen (string2); int i; size_t fdiag_len; int *buffer; size_t bufmax; - /* set the info for each string. */ - ctxt.string[0].data = string1; - ctxt.string[0].data_length = strlen (string1); - ctxt.string[1].data = string2; - ctxt.string[1].data_length = strlen (string2); - /* short-circuit obvious comparisons */ - if (ctxt.string[0].data_length == 0 && ctxt.string[1].data_length == 0) - return 1.0; - if (ctxt.string[0].data_length == 0 || ctxt.string[1].data_length == 0) - return 0.0; + if (xvec_length == 0 || yvec_length == 0) /* Prob: 1% */ + return (xvec_length == 0 && yvec_length == 0 ? 1.0 : 0.0); + + if (lower_bound > 0) + { + /* Compute a quick upper bound. + Each edit is an insertion or deletion of an element, hence modifies + the length of the sequence by at most 1. + Therefore, when starting from a sequence X and ending at a sequence Y, + with N edits, | yvec_length - xvec_length | <= N. (Proof by + induction over N.) + So, at the end, we will have + edit_count >= | xvec_length - yvec_length |. + and hence + result + = (xvec_length + yvec_length - edit_count) + / (xvec_length + yvec_length) + <= (xvec_length + yvec_length - | yvec_length - xvec_length |) + / (xvec_length + yvec_length) + = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length). + */ + volatile double upper_bound = + (double) (2 * MIN (xvec_length, yvec_length)) + / (xvec_length + yvec_length); + + if (upper_bound < lower_bound) /* Prob: 74% */ + /* Return an arbitrary value < LOWER_BOUND. */ + return 0.0; + +#if CHAR_BIT <= 8 + /* When X and Y are both small, avoid the overhead of setting up an + array of size 256. */ + if (xvec_length + yvec_length >= 20) /* Prob: 99% */ + { + /* Compute a less quick upper bound. + Each edit is an insertion or deletion of a character, hence + modifies the occurrence count of a character by 1 and leaves the + other occurrence counts unchanged. + Therefore, when starting from a sequence X and ending at a + sequence Y, and denoting the occurrence count of C in X with + OCC (X, C), with N edits, + sum_C | OCC (X, C) - OCC (Y, C) | <= N. + (Proof by induction over N.) + So, at the end, we will have + edit_count >= sum_C | OCC (X, C) - OCC (Y, C) |, + and hence + result + = (xvec_length + yvec_length - edit_count) + / (xvec_length + yvec_length) + <= (xvec_length + yvec_length - sum_C | OCC(X,C) - OCC(Y,C) |) + / (xvec_length + yvec_length). + */ + int occ_diff[UCHAR_MAX + 1]; /* array C -> OCC(X,C) - OCC(Y,C) */ + int sum; + + /* Determine the occurrence counts in X. */ + memset (occ_diff, 0, sizeof (occ_diff)); + for (i = xvec_length - 1; i >= 0; i--) + occ_diff[(unsigned char) string1[i]]++; + /* Subtract the occurrence counts in Y. */ + for (i = yvec_length - 1; i >= 0; i--) + occ_diff[(unsigned char) string2[i]]--; + /* Sum up the absolute values. */ + sum = 0; + for (i = 0; i <= UCHAR_MAX; i++) + { + int d = occ_diff[i]; + sum += (d >= 0 ? d : -d); + } + + upper_bound = 1.0 - (double) sum / (xvec_length + yvec_length); + + if (upper_bound < lower_bound) /* Prob: 66% */ + /* Return an arbitrary value < LOWER_BOUND. */ + return 0.0; + } +#endif + } + + /* set the info for each string. */ + ctxt.xvec = string1; + ctxt.yvec = string2; /* Set TOO_EXPENSIVE to be approximate square root of input size, bounded below by 256. */ ctxt.too_expensive = 1; - for (i = ctxt.string[0].data_length + ctxt.string[1].data_length; + for (i = xvec_length + yvec_length; i != 0; i >>= 2) ctxt.too_expensive <<= 1; @@ -633,7 +214,7 @@ fstrcmp (const char *string1, const char *string2) ctxt.too_expensive = 256; /* Allocate memory for fdiag and bdiag from a thread-local pool. */ - fdiag_len = ctxt.string[0].data_length + ctxt.string[1].data_length + 3; + fdiag_len = xvec_length + yvec_length + 3; gl_once (keys_init_once, keys_init); buffer = (int *) gl_tls_get (buffer_key); bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key); @@ -642,29 +223,49 @@ fstrcmp (const char *string1, const char *string2) /* Need more memory. */ bufmax = 2 * bufmax; if (fdiag_len > bufmax) - bufmax = fdiag_len; + bufmax = fdiag_len; /* Calling xrealloc would be a waste: buffer's contents does not need - to be preserved. */ + to be preserved. */ if (buffer != NULL) - free (buffer); - buffer = (int *) xmalloc (bufmax * (2 * sizeof (int))); + free (buffer); + buffer = (int *) xnmalloc (bufmax, 2 * sizeof (int)); gl_tls_set (buffer_key, buffer); gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax); } - ctxt.fdiag = buffer + ctxt.string[1].data_length + 1; + ctxt.fdiag = buffer + yvec_length + 1; ctxt.bdiag = ctxt.fdiag + fdiag_len; + /* The edit_count is only ever increased. The computation can be aborted + when + (xvec_length + yvec_length - edit_count) / (xvec_length + yvec_length) + < lower_bound, + or equivalently + edit_count > (xvec_length + yvec_length) * (1 - lower_bound) + or equivalently + edit_count > floor((xvec_length + yvec_length) * (1 - lower_bound)). + We need to add an epsilon inside the floor(...) argument, to neutralize + rounding errors. */ + ctxt.edit_count_limit = + (lower_bound < 1.0 + ? (int) ((xvec_length + yvec_length) * (1.0 - lower_bound + 0.000001)) + : 0); + /* Now do the main comparison algorithm */ - ctxt.string[0].edit_count = 0; - ctxt.string[1].edit_count = 0; - compareseq (0, ctxt.string[0].data_length, 0, ctxt.string[1].data_length, 0, - &ctxt); + ctxt.edit_count = - ctxt.edit_count_limit; + if (compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt)) /* Prob: 98% */ + /* The edit_count passed the limit. Hence the result would be + < lower_bound. We can return any value < lower_bound instead. */ + return 0.0; + ctxt.edit_count += ctxt.edit_count_limit; /* The result is - ((number of chars in common) / (average length of the strings)). + ((number of chars in common) / (average length of the strings)). + The numerator is + = xvec_length - (number of calls to NOTE_DELETE) + = yvec_length - (number of calls to NOTE_INSERT) + = 1/2 * (xvec_length + yvec_length - (number of edits)). This is admittedly biased towards finding that the strings are similar, however it does produce meaningful results. */ - return ((double) (ctxt.string[0].data_length + ctxt.string[1].data_length - - ctxt.string[1].edit_count - ctxt.string[0].edit_count) - / (ctxt.string[0].data_length + ctxt.string[1].data_length)); + return ((double) (xvec_length + yvec_length - ctxt.edit_count) + / (xvec_length + yvec_length)); }