source: trunk/third/gettext/lib/fstrcmp.c @ 16931

Revision 16931, 15.4 KB checked in by ghudson, 23 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r16930, which included commits to RCS files with non-trunk default branches.
Line 
1/* Functions to make fuzzy comparisons between strings
2   Copyright (C) 1988-1989, 1992-1993, 1995, 2001 Free Software Foundation, Inc.
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 2 of the License, or (at
7   your option) any later version.
8
9   This program is distributed in the hope that it will be useful, but
10   WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program; if not, write to the Free Software
16   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
18
19   Derived from GNU diff 2.7, analyze.c et al.
20
21   The basic algorithm is described in:
22   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
23   Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
24   see especially section 4.2, which describes the variation used below.
25
26   The basic algorithm was independently discovered as described in:
27   "Algorithms for Approximate String Matching", E. Ukkonen,
28   Information and Control Vol. 64, 1985, pp. 100-118.
29
30   Modified to work on strings rather than files
31   by Peter Miller <pmiller@agso.gov.au>, October 1995 */
32
33#ifdef HAVE_CONFIG_H
34# include "config.h"
35#endif
36
37#include <string.h>
38#include <stdio.h>
39
40#ifdef HAVE_LIMITS_H
41# include <limits.h>
42#else
43# define INT_MAX ((int)(~(unsigned)0 >> 1))
44#endif
45
46#include "system.h"
47#include "fstrcmp.h"
48
49
50/*
51 * Data on one input string being compared.
52 */
53struct string_data
54{
55  /* The string to be compared. */
56  const char *data;
57
58  /* The length of the string to be compared. */
59  int data_length;
60
61  /* The number of characters inserted or deleted. */
62  int edit_count;
63};
64
65static struct string_data string[2];
66
67
68#ifdef MINUS_H_FLAG
69
70/* This corresponds to the diff -H flag.  With this heuristic, for
71   strings with a constant small density of changes, the algorithm is
72   linear in the strings size.  This is unlikely in typical uses of
73   fstrcmp, and so is usually compiled out.  Besides, there is no
74   interface to set it true.  */
75static int heuristic;
76
77#endif
78
79
80/* Vector, indexed by diagonal, containing 1 + the X coordinate of the
81   point furthest along the given diagonal in the forward search of the
82   edit matrix.  */
83static int *fdiag;
84
85/* Vector, indexed by diagonal, containing the X coordinate of the point
86   furthest along the given diagonal in the backward search of the edit
87   matrix.  */
88static int *bdiag;
89
90/* Edit scripts longer than this are too expensive to compute.  */
91static int too_expensive;
92
93/* Snakes bigger than this are considered `big'.  */
94#define SNAKE_LIMIT     20
95
96struct partition
97{
98  /* Midpoints of this partition.  */
99  int xmid, ymid;
100
101  /* Nonzero if low half will be analyzed minimally.  */
102  int lo_minimal;
103
104  /* Likewise for high half.  */
105  int hi_minimal;
106};
107
108
109/* NAME
110        diag - find diagonal path
111
112   SYNOPSIS
113        int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
114                struct partition *part);
115
116   DESCRIPTION
117        Find the midpoint of the shortest edit script for a specified
118        portion of the two strings.
119
120        Scan from the beginnings of the strings, and simultaneously from
121        the ends, doing a breadth-first search through the space of
122        edit-sequence.  When the two searches meet, we have found the
123        midpoint of the shortest edit sequence.
124
125        If MINIMAL is nonzero, find the minimal edit script regardless
126        of expense.  Otherwise, if the search is too expensive, use
127        heuristics to stop the search and report a suboptimal answer.
128
129   RETURNS
130        Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal
131        number XMID - YMID equals the number of inserted characters
132        minus the number of deleted characters (counting only characters
133        before the midpoint).  Return the approximate edit cost; this is
134        the total number of characters inserted or deleted (counting
135        only characters before the midpoint), unless a heuristic is used
136        to terminate the search prematurely.
137
138        Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
139        for the left half of the partition is known; similarly for
140        PART->RIGHT_MINIMAL.
141
142   CAVEAT
143        This function assumes that the first characters of the specified
144        portions of the two strings do not match, and likewise that the
145        last characters do not match.  The caller must trim matching
146        characters from the beginning and end of the portions it is
147        going to specify.
148
149        If we return the "wrong" partitions, the worst this can do is
150        cause suboptimal diff output.  It cannot cause incorrect diff
151        output.  */
152
153static int diag PARAMS ((int, int, int, int, int, struct partition *));
154
155static int
156diag (xoff, xlim, yoff, ylim, minimal, part)
157     int xoff;
158     int xlim;
159     int yoff;
160     int ylim;
161     int minimal;
162     struct partition *part;
163{
164  int *const fd = fdiag;        /* Give the compiler a chance. */
165  int *const bd = bdiag;        /* Additional help for the compiler. */
166  const char *const xv = string[0].data;        /* Still more help for the compiler. */
167  const char *const yv = string[1].data;        /* And more and more . . . */
168  const int dmin = xoff - ylim; /* Minimum valid diagonal. */
169  const int dmax = xlim - yoff; /* Maximum valid diagonal. */
170  const int fmid = xoff - yoff; /* Center diagonal of top-down search. */
171  const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
172  int fmin = fmid;
173  int fmax = fmid;              /* Limits of top-down search. */
174  int bmin = bmid;
175  int bmax = bmid;              /* Limits of bottom-up search. */
176  int c;                        /* Cost. */
177  int odd = (fmid - bmid) & 1;
178
179  /*
180         * True if southeast corner is on an odd diagonal with respect
181         * to the northwest.
182         */
183  fd[fmid] = xoff;
184  bd[bmid] = xlim;
185  for (c = 1;; ++c)
186    {
187      int d;                    /* Active diagonal. */
188      int big_snake;
189
190      big_snake = 0;
191      /* Extend the top-down search by an edit step in each diagonal. */
192      if (fmin > dmin)
193        fd[--fmin - 1] = -1;
194      else
195        ++fmin;
196      if (fmax < dmax)
197        fd[++fmax + 1] = -1;
198      else
199        --fmax;
200      for (d = fmax; d >= fmin; d -= 2)
201        {
202          int x;
203          int y;
204          int oldx;
205          int tlo;
206          int thi;
207
208          tlo = fd[d - 1],
209            thi = fd[d + 1];
210
211          if (tlo >= thi)
212            x = tlo + 1;
213          else
214            x = thi;
215          oldx = x;
216          y = x - d;
217          while (x < xlim && y < ylim && xv[x] == yv[y])
218            {
219              ++x;
220              ++y;
221            }
222          if (x - oldx > SNAKE_LIMIT)
223            big_snake = 1;
224          fd[d] = x;
225          if (odd && bmin <= d && d <= bmax && bd[d] <= x)
226            {
227              part->xmid = x;
228              part->ymid = y;
229              part->lo_minimal = part->hi_minimal = 1;
230              return 2 * c - 1;
231            }
232        }
233      /* Similarly extend the bottom-up search.  */
234      if (bmin > dmin)
235        bd[--bmin - 1] = INT_MAX;
236      else
237        ++bmin;
238      if (bmax < dmax)
239        bd[++bmax + 1] = INT_MAX;
240      else
241        --bmax;
242      for (d = bmax; d >= bmin; d -= 2)
243        {
244          int x;
245          int y;
246          int oldx;
247          int tlo;
248          int thi;
249
250          tlo = bd[d - 1],
251            thi = bd[d + 1];
252          if (tlo < thi)
253            x = tlo;
254          else
255            x = thi - 1;
256          oldx = x;
257          y = x - d;
258          while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
259            {
260              --x;
261              --y;
262            }
263          if (oldx - x > SNAKE_LIMIT)
264            big_snake = 1;
265          bd[d] = x;
266          if (!odd && fmin <= d && d <= fmax && x <= fd[d])
267            {
268              part->xmid = x;
269              part->ymid = y;
270              part->lo_minimal = part->hi_minimal = 1;
271              return 2 * c;
272            }
273        }
274
275      if (minimal)
276        continue;
277
278#ifdef MINUS_H_FLAG
279      /* Heuristic: check occasionally for a diagonal that has made lots
280         of progress compared with the edit distance.  If we have any
281         such, find the one that has made the most progress and return
282         it as if it had succeeded.
283
284         With this heuristic, for strings with a constant small density
285         of changes, the algorithm is linear in the strings size.  */
286      if (c > 200 && big_snake && heuristic)
287        {
288          int best;
289
290          best = 0;
291          for (d = fmax; d >= fmin; d -= 2)
292            {
293              int dd;
294              int x;
295              int y;
296              int v;
297
298              dd = d - fmid;
299              x = fd[d];
300              y = x - d;
301              v = (x - xoff) * 2 - dd;
302
303              if (v > 12 * (c + (dd < 0 ? -dd : dd)))
304                {
305                  if
306                    (
307                      v > best
308                      &&
309                      xoff + SNAKE_LIMIT <= x
310                      &&
311                      x < xlim
312                      &&
313                      yoff + SNAKE_LIMIT <= y
314                      &&
315                      y < ylim
316                    )
317                    {
318                      /* We have a good enough best diagonal; now insist
319                         that it end with a significant snake.  */
320                      int k;
321
322                      for (k = 1; xv[x - k] == yv[y - k]; k++)
323                        {
324                          if (k == SNAKE_LIMIT)
325                            {
326                              best = v;
327                              part->xmid = x;
328                              part->ymid = y;
329                              break;
330                            }
331                        }
332                    }
333                }
334            }
335          if (best > 0)
336            {
337              part->lo_minimal = 1;
338              part->hi_minimal = 0;
339              return 2 * c - 1;
340            }
341          best = 0;
342          for (d = bmax; d >= bmin; d -= 2)
343            {
344              int dd;
345              int x;
346              int y;
347              int v;
348
349              dd = d - bmid;
350              x = bd[d];
351              y = x - d;
352              v = (xlim - x) * 2 + dd;
353
354              if (v > 12 * (c + (dd < 0 ? -dd : dd)))
355                {
356                  if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
357                      yoff < y && y <= ylim - SNAKE_LIMIT)
358                    {
359                      /* We have a good enough best diagonal; now insist
360                         that it end with a significant snake.  */
361                      int k;
362
363                      for (k = 0; xv[x + k] == yv[y + k]; k++)
364                        {
365                          if (k == SNAKE_LIMIT - 1)
366                            {
367                              best = v;
368                              part->xmid = x;
369                              part->ymid = y;
370                              break;
371                            }
372                        }
373                    }
374                }
375            }
376          if (best > 0)
377            {
378              part->lo_minimal = 0;
379              part->hi_minimal = 1;
380              return 2 * c - 1;
381            }
382        }
383#endif /* MINUS_H_FLAG */
384
385      /* Heuristic: if we've gone well beyond the call of duty, give up
386         and report halfway between our best results so far.  */
387      if (c >= too_expensive)
388        {
389          int fxybest;
390          int fxbest;
391          int bxybest;
392          int bxbest;
393
394          /* Pacify `gcc -Wall'. */
395          fxbest = 0;
396          bxbest = 0;
397
398          /* Find forward diagonal that maximizes X + Y.  */
399          fxybest = -1;
400          for (d = fmax; d >= fmin; d -= 2)
401            {
402              int x;
403              int y;
404
405              x = fd[d] < xlim ? fd[d] : xlim;
406              y = x - d;
407
408              if (ylim < y)
409                {
410                  x = ylim + d;
411                  y = ylim;
412                }
413              if (fxybest < x + y)
414                {
415                  fxybest = x + y;
416                  fxbest = x;
417                }
418            }
419          /* Find backward diagonal that minimizes X + Y.  */
420          bxybest = INT_MAX;
421          for (d = bmax; d >= bmin; d -= 2)
422            {
423              int x;
424              int y;
425
426              x = xoff > bd[d] ? xoff : bd[d];
427              y = x - d;
428
429              if (y < yoff)
430                {
431                  x = yoff + d;
432                  y = yoff;
433                }
434              if (x + y < bxybest)
435                {
436                  bxybest = x + y;
437                  bxbest = x;
438                }
439            }
440          /* Use the better of the two diagonals.  */
441          if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
442            {
443              part->xmid = fxbest;
444              part->ymid = fxybest - fxbest;
445              part->lo_minimal = 1;
446              part->hi_minimal = 0;
447            }
448          else
449            {
450              part->xmid = bxbest;
451              part->ymid = bxybest - bxbest;
452              part->lo_minimal = 0;
453              part->hi_minimal = 1;
454            }
455          return 2 * c - 1;
456        }
457    }
458}
459
460
461/* NAME
462        compareseq - find edit sequence
463
464   SYNOPSIS
465        void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
466
467   DESCRIPTION
468        Compare in detail contiguous subsequences of the two strings
469        which are known, as a whole, to match each other.
470
471        The subsequence of string 0 is [XOFF, XLIM) and likewise for
472        string 1.
473
474        Note that XLIM, YLIM are exclusive bounds.  All character
475        numbers are origin-0.
476
477        If MINIMAL is nonzero, find a minimal difference no matter how
478        expensive it is.  */
479
480static void compareseq PARAMS ((int, int, int, int, int));
481
482static void
483compareseq (xoff, xlim, yoff, ylim, minimal)
484     int xoff;
485     int xlim;
486     int yoff;
487     int ylim;
488     int minimal;
489{
490  const char *const xv = string[0].data;        /* Help the compiler.  */
491  const char *const yv = string[1].data;
492
493  /* Slide down the bottom initial diagonal. */
494  while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
495    {
496      ++xoff;
497      ++yoff;
498    }
499
500  /* Slide up the top initial diagonal. */
501  while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
502    {
503      --xlim;
504      --ylim;
505    }
506
507  /* Handle simple cases. */
508  if (xoff == xlim)
509    {
510      while (yoff < ylim)
511        {
512          ++string[1].edit_count;
513          ++yoff;
514        }
515    }
516  else if (yoff == ylim)
517    {
518      while (xoff < xlim)
519        {
520          ++string[0].edit_count;
521          ++xoff;
522        }
523    }
524  else
525    {
526      int c;
527      struct partition part;
528
529      /* Find a point of correspondence in the middle of the strings.  */
530      c = diag (xoff, xlim, yoff, ylim, minimal, &part);
531      if (c == 1)
532        {
533#if 0
534          /* This should be impossible, because it implies that one of
535             the two subsequences is empty, and that case was handled
536             above without calling `diag'.  Let's verify that this is
537             true.  */
538          abort ();
539#else
540          /* The two subsequences differ by a single insert or delete;
541             record it and we are done.  */
542          if (part.xmid - part.ymid < xoff - yoff)
543            ++string[1].edit_count;
544          else
545            ++string[0].edit_count;
546#endif
547        }
548      else
549        {
550          /* Use the partitions to split this problem into subproblems.  */
551          compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
552          compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
553        }
554    }
555}
556
557
558/* NAME
559        fstrcmp - fuzzy string compare
560
561   SYNOPSIS
562        double fstrcmp(const char *, const char *);
563
564   DESCRIPTION
565        The fstrcmp function may be used to compare two string for
566        similarity.  It is very useful in reducing "cascade" or
567        "secondary" errors in compilers or other situations where
568        symbol tables occur.
569
570   RETURNS
571        double; 0 if the strings are entirly dissimilar, 1 if the
572        strings are identical, and a number in between if they are
573        similar.  */
574
575double
576fstrcmp (string1, string2)
577     const char *string1;
578     const char *string2;
579{
580  int i;
581
582  size_t fdiag_len;
583  static int *fdiag_buf;
584  static size_t fdiag_max;
585
586  /* set the info for each string.  */
587  string[0].data = string1;
588  string[0].data_length = strlen (string1);
589  string[1].data = string2;
590  string[1].data_length = strlen (string2);
591
592  /* short-circuit obvious comparisons */
593  if (string[0].data_length == 0 && string[1].data_length == 0)
594    return 1.0;
595  if (string[0].data_length == 0 || string[1].data_length == 0)
596    return 0.0;
597
598  /* Set TOO_EXPENSIVE to be approximate square root of input size,
599     bounded below by 256.  */
600  too_expensive = 1;
601  for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2)
602    too_expensive <<= 1;
603  if (too_expensive < 256)
604    too_expensive = 256;
605
606  /* Because fstrcmp is typically called multiple times, while scanning
607     symbol tables, etc, attempt to minimize the number of memory
608     allocations performed.  Thus, we use a static buffer for the
609     diagonal vectors, and never free them.  */
610  fdiag_len = string[0].data_length + string[1].data_length + 3;
611  if (fdiag_len > fdiag_max)
612    {
613      fdiag_max = fdiag_len;
614      fdiag_buf = xrealloc (fdiag_buf, fdiag_max * (2 * sizeof (int)));
615    }
616  fdiag = fdiag_buf + string[1].data_length + 1;
617  bdiag = fdiag + fdiag_len;
618
619  /* Now do the main comparison algorithm */
620  string[0].edit_count = 0;
621  string[1].edit_count = 0;
622  compareseq (0, string[0].data_length, 0, string[1].data_length, 0);
623
624  /* The result is
625        ((number of chars in common) / (average length of the strings)).
626     This is admittedly biased towards finding that the strings are
627     similar, however it does produce meaningful results.  */
628  return ((double) (string[0].data_length + string[1].data_length -
629    string[1].edit_count - string[0].edit_count) / (string[0].data_length
630    + string[1].data_length));
631}
Note: See TracBrowser for help on using the repository browser.