1 | /* Analyze file differences for GNU DIFF. |
---|
2 | Copyright (C) 1988, 1989, 1992, 1993 Free Software Foundation, Inc. |
---|
3 | |
---|
4 | This file is part of GNU DIFF. |
---|
5 | |
---|
6 | GNU DIFF is free software; you can redistribute it and/or modify |
---|
7 | it under the terms of the GNU General Public License as published by |
---|
8 | the Free Software Foundation; either version 2, or (at your option) |
---|
9 | any later version. |
---|
10 | |
---|
11 | GNU DIFF is distributed in the hope that it will be useful, |
---|
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
14 | GNU General Public License for more details. |
---|
15 | |
---|
16 | You should have received a copy of the GNU General Public License |
---|
17 | along with GNU DIFF; see the file COPYING. If not, write to |
---|
18 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ |
---|
19 | |
---|
20 | /* The basic algorithm is described in: |
---|
21 | "An O(ND) Difference Algorithm and its Variations", Eugene Myers, |
---|
22 | Algorithmica Vol. 1 No. 2, 1986, pp. 251-266; |
---|
23 | see especially section 4.2, which describes the variation used below. |
---|
24 | Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE |
---|
25 | heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) |
---|
26 | at the price of producing suboptimal output for large inputs with |
---|
27 | many differences. |
---|
28 | |
---|
29 | The basic algorithm was independently discovered as described in: |
---|
30 | "Algorithms for Approximate String Matching", E. Ukkonen, |
---|
31 | Information and Control Vol. 64, 1985, pp. 100-118. */ |
---|
32 | |
---|
33 | #include "diff.h" |
---|
34 | #include "cmpbuf.h" |
---|
35 | |
---|
36 | extern int no_discards; |
---|
37 | |
---|
38 | static int *xvec, *yvec; /* Vectors being compared. */ |
---|
39 | static int *fdiag; /* Vector, indexed by diagonal, containing |
---|
40 | 1 + the X coordinate of the point furthest |
---|
41 | along the given diagonal in the forward |
---|
42 | search of the edit matrix. */ |
---|
43 | static int *bdiag; /* Vector, indexed by diagonal, containing |
---|
44 | the X coordinate of the point furthest |
---|
45 | along the given diagonal in the backward |
---|
46 | search of the edit matrix. */ |
---|
47 | static int too_expensive; /* Edit scripts longer than this are too |
---|
48 | expensive to compute. */ |
---|
49 | |
---|
50 | #define SNAKE_LIMIT 20 /* Snakes bigger than this are considered `big'. */ |
---|
51 | |
---|
52 | struct partition |
---|
53 | { |
---|
54 | int xmid, ymid; /* Midpoints of this partition. */ |
---|
55 | int lo_minimal; /* Nonzero if low half will be analyzed minimally. */ |
---|
56 | int hi_minimal; /* Likewise for high half. */ |
---|
57 | }; |
---|
58 | |
---|
59 | static int diag PARAMS((int, int, int, int, int, struct partition *)); |
---|
60 | static struct change *add_change PARAMS((int, int, int, int, struct change *)); |
---|
61 | static struct change *build_reverse_script PARAMS((struct file_data const[])); |
---|
62 | static struct change *build_script PARAMS((struct file_data const[])); |
---|
63 | static void briefly_report PARAMS((int, struct file_data const[])); |
---|
64 | static void compareseq PARAMS((int, int, int, int, int)); |
---|
65 | static void discard_confusing_lines PARAMS((struct file_data[])); |
---|
66 | static void shift_boundaries PARAMS((struct file_data[])); |
---|
67 | |
---|
68 | /* Find the midpoint of the shortest edit script for a specified |
---|
69 | portion of the two files. |
---|
70 | |
---|
71 | Scan from the beginnings of the files, and simultaneously from the ends, |
---|
72 | doing a breadth-first search through the space of edit-sequence. |
---|
73 | When the two searches meet, we have found the midpoint of the shortest |
---|
74 | edit sequence. |
---|
75 | |
---|
76 | If MINIMAL is nonzero, find the minimal edit script regardless |
---|
77 | of expense. Otherwise, if the search is too expensive, use |
---|
78 | heuristics to stop the search and report a suboptimal answer. |
---|
79 | |
---|
80 | Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal number |
---|
81 | XMID - YMID equals the number of inserted lines minus the number |
---|
82 | of deleted lines (counting only lines before the midpoint). |
---|
83 | Return the approximate edit cost; this is the total number of |
---|
84 | lines inserted or deleted (counting only lines before the midpoint), |
---|
85 | unless a heuristic is used to terminate the search prematurely. |
---|
86 | |
---|
87 | Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script for the |
---|
88 | left half of the partition is known; similarly for PART->RIGHT_MINIMAL. |
---|
89 | |
---|
90 | This function assumes that the first lines of the specified portions |
---|
91 | of the two files do not match, and likewise that the last lines do not |
---|
92 | match. The caller must trim matching lines from the beginning and end |
---|
93 | of the portions it is going to specify. |
---|
94 | |
---|
95 | If we return the "wrong" partitions, |
---|
96 | the worst this can do is cause suboptimal diff output. |
---|
97 | It cannot cause incorrect diff output. */ |
---|
98 | |
---|
99 | static int |
---|
100 | diag (xoff, xlim, yoff, ylim, minimal, part) |
---|
101 | int xoff, xlim, yoff, ylim, minimal; |
---|
102 | struct partition *part; |
---|
103 | { |
---|
104 | int *const fd = fdiag; /* Give the compiler a chance. */ |
---|
105 | int *const bd = bdiag; /* Additional help for the compiler. */ |
---|
106 | int const *const xv = xvec; /* Still more help for the compiler. */ |
---|
107 | int const *const yv = yvec; /* And more and more . . . */ |
---|
108 | int const dmin = xoff - ylim; /* Minimum valid diagonal. */ |
---|
109 | int const dmax = xlim - yoff; /* Maximum valid diagonal. */ |
---|
110 | int const fmid = xoff - yoff; /* Center diagonal of top-down search. */ |
---|
111 | int const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ |
---|
112 | int fmin = fmid, fmax = fmid; /* Limits of top-down search. */ |
---|
113 | int bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */ |
---|
114 | int c; /* Cost. */ |
---|
115 | int odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd |
---|
116 | diagonal with respect to the northwest. */ |
---|
117 | |
---|
118 | fd[fmid] = xoff; |
---|
119 | bd[bmid] = xlim; |
---|
120 | |
---|
121 | for (c = 1;; ++c) |
---|
122 | { |
---|
123 | int d; /* Active diagonal. */ |
---|
124 | int big_snake = 0; |
---|
125 | |
---|
126 | /* Extend the top-down search by an edit step in each diagonal. */ |
---|
127 | fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin; |
---|
128 | fmax < dmax ? fd[++fmax + 1] = -1 : --fmax; |
---|
129 | for (d = fmax; d >= fmin; d -= 2) |
---|
130 | { |
---|
131 | int x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1]; |
---|
132 | |
---|
133 | if (tlo >= thi) |
---|
134 | x = tlo + 1; |
---|
135 | else |
---|
136 | x = thi; |
---|
137 | oldx = x; |
---|
138 | y = x - d; |
---|
139 | while (x < xlim && y < ylim && xv[x] == yv[y]) |
---|
140 | ++x, ++y; |
---|
141 | if (x - oldx > SNAKE_LIMIT) |
---|
142 | big_snake = 1; |
---|
143 | fd[d] = x; |
---|
144 | if (odd && bmin <= d && d <= bmax && bd[d] <= x) |
---|
145 | { |
---|
146 | part->xmid = x; |
---|
147 | part->ymid = y; |
---|
148 | part->lo_minimal = part->hi_minimal = 1; |
---|
149 | return 2 * c - 1; |
---|
150 | } |
---|
151 | } |
---|
152 | |
---|
153 | /* Similarly extend the bottom-up search. */ |
---|
154 | bmin > dmin ? bd[--bmin - 1] = INT_MAX : ++bmin; |
---|
155 | bmax < dmax ? bd[++bmax + 1] = INT_MAX : --bmax; |
---|
156 | for (d = bmax; d >= bmin; d -= 2) |
---|
157 | { |
---|
158 | int x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1]; |
---|
159 | |
---|
160 | if (tlo < thi) |
---|
161 | x = tlo; |
---|
162 | else |
---|
163 | x = thi - 1; |
---|
164 | oldx = x; |
---|
165 | y = x - d; |
---|
166 | while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) |
---|
167 | --x, --y; |
---|
168 | if (oldx - x > SNAKE_LIMIT) |
---|
169 | big_snake = 1; |
---|
170 | bd[d] = x; |
---|
171 | if (!odd && fmin <= d && d <= fmax && x <= fd[d]) |
---|
172 | { |
---|
173 | part->xmid = x; |
---|
174 | part->ymid = y; |
---|
175 | part->lo_minimal = part->hi_minimal = 1; |
---|
176 | return 2 * c; |
---|
177 | } |
---|
178 | } |
---|
179 | |
---|
180 | if (minimal) |
---|
181 | continue; |
---|
182 | |
---|
183 | /* Heuristic: check occasionally for a diagonal that has made |
---|
184 | lots of progress compared with the edit distance. |
---|
185 | If we have any such, find the one that has made the most |
---|
186 | progress and return it as if it had succeeded. |
---|
187 | |
---|
188 | With this heuristic, for files with a constant small density |
---|
189 | of changes, the algorithm is linear in the file size. */ |
---|
190 | |
---|
191 | if (c > 200 && big_snake && heuristic) |
---|
192 | { |
---|
193 | int best; |
---|
194 | |
---|
195 | best = 0; |
---|
196 | for (d = fmax; d >= fmin; d -= 2) |
---|
197 | { |
---|
198 | int dd = d - fmid; |
---|
199 | int x = fd[d]; |
---|
200 | int y = x - d; |
---|
201 | int v = (x - xoff) * 2 - dd; |
---|
202 | if (v > 12 * (c + (dd < 0 ? -dd : dd))) |
---|
203 | { |
---|
204 | if (v > best |
---|
205 | && xoff + SNAKE_LIMIT <= x && x < xlim |
---|
206 | && yoff + SNAKE_LIMIT <= y && y < ylim) |
---|
207 | { |
---|
208 | /* We have a good enough best diagonal; |
---|
209 | now insist that it end with a significant snake. */ |
---|
210 | int k; |
---|
211 | |
---|
212 | for (k = 1; xv[x - k] == yv[y - k]; k++) |
---|
213 | if (k == SNAKE_LIMIT) |
---|
214 | { |
---|
215 | best = v; |
---|
216 | part->xmid = x; |
---|
217 | part->ymid = y; |
---|
218 | break; |
---|
219 | } |
---|
220 | } |
---|
221 | } |
---|
222 | } |
---|
223 | if (best > 0) |
---|
224 | { |
---|
225 | part->lo_minimal = 1; |
---|
226 | part->hi_minimal = 0; |
---|
227 | return 2 * c - 1; |
---|
228 | } |
---|
229 | |
---|
230 | best = 0; |
---|
231 | for (d = bmax; d >= bmin; d -= 2) |
---|
232 | { |
---|
233 | int dd = d - bmid; |
---|
234 | int x = bd[d]; |
---|
235 | int y = x - d; |
---|
236 | int v = (xlim - x) * 2 + dd; |
---|
237 | if (v > 12 * (c + (dd < 0 ? -dd : dd))) |
---|
238 | { |
---|
239 | if (v > best |
---|
240 | && xoff < x && x <= xlim - SNAKE_LIMIT |
---|
241 | && yoff < y && y <= ylim - SNAKE_LIMIT) |
---|
242 | { |
---|
243 | /* We have a good enough best diagonal; |
---|
244 | now insist that it end with a significant snake. */ |
---|
245 | int k; |
---|
246 | |
---|
247 | for (k = 0; xv[x + k] == yv[y + k]; k++) |
---|
248 | if (k == SNAKE_LIMIT - 1) |
---|
249 | { |
---|
250 | best = v; |
---|
251 | part->xmid = x; |
---|
252 | part->ymid = y; |
---|
253 | break; |
---|
254 | } |
---|
255 | } |
---|
256 | } |
---|
257 | } |
---|
258 | if (best > 0) |
---|
259 | { |
---|
260 | part->lo_minimal = 0; |
---|
261 | part->hi_minimal = 1; |
---|
262 | return 2 * c - 1; |
---|
263 | } |
---|
264 | } |
---|
265 | |
---|
266 | /* Heuristic: if we've gone well beyond the call of duty, |
---|
267 | give up and report halfway between our best results so far. */ |
---|
268 | if (c >= too_expensive) |
---|
269 | { |
---|
270 | int fxybest, fxbest; |
---|
271 | int bxybest, bxbest; |
---|
272 | |
---|
273 | fxbest = bxbest = 0; /* Pacify `gcc -Wall'. */ |
---|
274 | |
---|
275 | /* Find forward diagonal that maximizes X + Y. */ |
---|
276 | fxybest = -1; |
---|
277 | for (d = fmax; d >= fmin; d -= 2) |
---|
278 | { |
---|
279 | int x = min (fd[d], xlim); |
---|
280 | int y = x - d; |
---|
281 | if (ylim < y) |
---|
282 | x = ylim + d, y = ylim; |
---|
283 | if (fxybest < x + y) |
---|
284 | { |
---|
285 | fxybest = x + y; |
---|
286 | fxbest = x; |
---|
287 | } |
---|
288 | } |
---|
289 | |
---|
290 | /* Find backward diagonal that minimizes X + Y. */ |
---|
291 | bxybest = INT_MAX; |
---|
292 | for (d = bmax; d >= bmin; d -= 2) |
---|
293 | { |
---|
294 | int x = max (xoff, bd[d]); |
---|
295 | int y = x - d; |
---|
296 | if (y < yoff) |
---|
297 | x = yoff + d, y = yoff; |
---|
298 | if (x + y < bxybest) |
---|
299 | { |
---|
300 | bxybest = x + y; |
---|
301 | bxbest = x; |
---|
302 | } |
---|
303 | } |
---|
304 | |
---|
305 | /* Use the better of the two diagonals. */ |
---|
306 | if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) |
---|
307 | { |
---|
308 | part->xmid = fxbest; |
---|
309 | part->ymid = fxybest - fxbest; |
---|
310 | part->lo_minimal = 1; |
---|
311 | part->hi_minimal = 0; |
---|
312 | } |
---|
313 | else |
---|
314 | { |
---|
315 | part->xmid = bxbest; |
---|
316 | part->ymid = bxybest - bxbest; |
---|
317 | part->lo_minimal = 0; |
---|
318 | part->hi_minimal = 1; |
---|
319 | } |
---|
320 | return 2 * c - 1; |
---|
321 | } |
---|
322 | } |
---|
323 | } |
---|
324 | |
---|
325 | /* Compare in detail contiguous subsequences of the two files |
---|
326 | which are known, as a whole, to match each other. |
---|
327 | |
---|
328 | The results are recorded in the vectors files[N].changed_flag, by |
---|
329 | storing a 1 in the element for each line that is an insertion or deletion. |
---|
330 | |
---|
331 | The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1. |
---|
332 | |
---|
333 | Note that XLIM, YLIM are exclusive bounds. |
---|
334 | All line numbers are origin-0 and discarded lines are not counted. |
---|
335 | |
---|
336 | If MINIMAL is nonzero, find a minimal difference no matter how |
---|
337 | expensive it is. */ |
---|
338 | |
---|
339 | static void |
---|
340 | compareseq (xoff, xlim, yoff, ylim, minimal) |
---|
341 | int xoff, xlim, yoff, ylim, minimal; |
---|
342 | { |
---|
343 | int * const xv = xvec; /* Help the compiler. */ |
---|
344 | int * const yv = yvec; |
---|
345 | |
---|
346 | /* Slide down the bottom initial diagonal. */ |
---|
347 | while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) |
---|
348 | ++xoff, ++yoff; |
---|
349 | /* Slide up the top initial diagonal. */ |
---|
350 | while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) |
---|
351 | --xlim, --ylim; |
---|
352 | |
---|
353 | /* Handle simple cases. */ |
---|
354 | if (xoff == xlim) |
---|
355 | while (yoff < ylim) |
---|
356 | files[1].changed_flag[files[1].realindexes[yoff++]] = 1; |
---|
357 | else if (yoff == ylim) |
---|
358 | while (xoff < xlim) |
---|
359 | files[0].changed_flag[files[0].realindexes[xoff++]] = 1; |
---|
360 | else |
---|
361 | { |
---|
362 | int c; |
---|
363 | struct partition part; |
---|
364 | |
---|
365 | /* Find a point of correspondence in the middle of the files. */ |
---|
366 | |
---|
367 | c = diag (xoff, xlim, yoff, ylim, minimal, &part); |
---|
368 | |
---|
369 | if (c == 1) |
---|
370 | { |
---|
371 | /* This should be impossible, because it implies that |
---|
372 | one of the two subsequences is empty, |
---|
373 | and that case was handled above without calling `diag'. |
---|
374 | Let's verify that this is true. */ |
---|
375 | abort (); |
---|
376 | #if 0 |
---|
377 | /* The two subsequences differ by a single insert or delete; |
---|
378 | record it and we are done. */ |
---|
379 | if (part.xmid - part.ymid < xoff - yoff) |
---|
380 | files[1].changed_flag[files[1].realindexes[part.ymid - 1]] = 1; |
---|
381 | else |
---|
382 | files[0].changed_flag[files[0].realindexes[part.xmid]] = 1; |
---|
383 | #endif |
---|
384 | } |
---|
385 | else |
---|
386 | { |
---|
387 | /* Use the partitions to split this problem into subproblems. */ |
---|
388 | compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal); |
---|
389 | compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal); |
---|
390 | } |
---|
391 | } |
---|
392 | } |
---|
393 | |
---|
394 | /* Discard lines from one file that have no matches in the other file. |
---|
395 | |
---|
396 | A line which is discarded will not be considered by the actual |
---|
397 | comparison algorithm; it will be as if that line were not in the file. |
---|
398 | The file's `realindexes' table maps virtual line numbers |
---|
399 | (which don't count the discarded lines) into real line numbers; |
---|
400 | this is how the actual comparison algorithm produces results |
---|
401 | that are comprehensible when the discarded lines are counted. |
---|
402 | |
---|
403 | When we discard a line, we also mark it as a deletion or insertion |
---|
404 | so that it will be printed in the output. */ |
---|
405 | |
---|
406 | static void |
---|
407 | discard_confusing_lines (filevec) |
---|
408 | struct file_data filevec[]; |
---|
409 | { |
---|
410 | unsigned int f, i; |
---|
411 | char *discarded[2]; |
---|
412 | int *equiv_count[2]; |
---|
413 | int *p; |
---|
414 | |
---|
415 | /* Allocate our results. */ |
---|
416 | p = (int *) xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines) |
---|
417 | * (2 * sizeof (int))); |
---|
418 | for (f = 0; f < 2; f++) |
---|
419 | { |
---|
420 | filevec[f].undiscarded = p; p += filevec[f].buffered_lines; |
---|
421 | filevec[f].realindexes = p; p += filevec[f].buffered_lines; |
---|
422 | } |
---|
423 | |
---|
424 | /* Set up equiv_count[F][I] as the number of lines in file F |
---|
425 | that fall in equivalence class I. */ |
---|
426 | |
---|
427 | p = (int *) xmalloc (filevec[0].equiv_max * (2 * sizeof (int))); |
---|
428 | equiv_count[0] = p; |
---|
429 | equiv_count[1] = p + filevec[0].equiv_max; |
---|
430 | bzero (p, filevec[0].equiv_max * (2 * sizeof (int))); |
---|
431 | |
---|
432 | for (i = 0; i < filevec[0].buffered_lines; ++i) |
---|
433 | ++equiv_count[0][filevec[0].equivs[i]]; |
---|
434 | for (i = 0; i < filevec[1].buffered_lines; ++i) |
---|
435 | ++equiv_count[1][filevec[1].equivs[i]]; |
---|
436 | |
---|
437 | /* Set up tables of which lines are going to be discarded. */ |
---|
438 | |
---|
439 | discarded[0] = xmalloc (sizeof (char) |
---|
440 | * (filevec[0].buffered_lines |
---|
441 | + filevec[1].buffered_lines)); |
---|
442 | discarded[1] = discarded[0] + filevec[0].buffered_lines; |
---|
443 | bzero (discarded[0], sizeof (char) * (filevec[0].buffered_lines |
---|
444 | + filevec[1].buffered_lines)); |
---|
445 | |
---|
446 | /* Mark to be discarded each line that matches no line of the other file. |
---|
447 | If a line matches many lines, mark it as provisionally discardable. */ |
---|
448 | |
---|
449 | for (f = 0; f < 2; f++) |
---|
450 | { |
---|
451 | unsigned int end = filevec[f].buffered_lines; |
---|
452 | char *discards = discarded[f]; |
---|
453 | int *counts = equiv_count[1 - f]; |
---|
454 | int *equivs = filevec[f].equivs; |
---|
455 | unsigned int many = 5; |
---|
456 | unsigned int tem = end / 64; |
---|
457 | |
---|
458 | /* Multiply MANY by approximate square root of number of lines. |
---|
459 | That is the threshold for provisionally discardable lines. */ |
---|
460 | while ((tem = tem >> 2) > 0) |
---|
461 | many *= 2; |
---|
462 | |
---|
463 | for (i = 0; i < end; i++) |
---|
464 | { |
---|
465 | int nmatch; |
---|
466 | if (equivs[i] == 0) |
---|
467 | continue; |
---|
468 | nmatch = counts[equivs[i]]; |
---|
469 | if (nmatch == 0) |
---|
470 | discards[i] = 1; |
---|
471 | else if (nmatch > many) |
---|
472 | discards[i] = 2; |
---|
473 | } |
---|
474 | } |
---|
475 | |
---|
476 | /* Don't really discard the provisional lines except when they occur |
---|
477 | in a run of discardables, with nonprovisionals at the beginning |
---|
478 | and end. */ |
---|
479 | |
---|
480 | for (f = 0; f < 2; f++) |
---|
481 | { |
---|
482 | unsigned int end = filevec[f].buffered_lines; |
---|
483 | register char *discards = discarded[f]; |
---|
484 | |
---|
485 | for (i = 0; i < end; i++) |
---|
486 | { |
---|
487 | /* Cancel provisional discards not in middle of run of discards. */ |
---|
488 | if (discards[i] == 2) |
---|
489 | discards[i] = 0; |
---|
490 | else if (discards[i] != 0) |
---|
491 | { |
---|
492 | /* We have found a nonprovisional discard. */ |
---|
493 | register int j; |
---|
494 | unsigned int length; |
---|
495 | unsigned int provisional = 0; |
---|
496 | |
---|
497 | /* Find end of this run of discardable lines. |
---|
498 | Count how many are provisionally discardable. */ |
---|
499 | for (j = i; j < end; j++) |
---|
500 | { |
---|
501 | if (discards[j] == 0) |
---|
502 | break; |
---|
503 | if (discards[j] == 2) |
---|
504 | ++provisional; |
---|
505 | } |
---|
506 | |
---|
507 | /* Cancel provisional discards at end, and shrink the run. */ |
---|
508 | while (j > i && discards[j - 1] == 2) |
---|
509 | discards[--j] = 0, --provisional; |
---|
510 | |
---|
511 | /* Now we have the length of a run of discardable lines |
---|
512 | whose first and last are not provisional. */ |
---|
513 | length = j - i; |
---|
514 | |
---|
515 | /* If 1/4 of the lines in the run are provisional, |
---|
516 | cancel discarding of all provisional lines in the run. */ |
---|
517 | if (provisional * 4 > length) |
---|
518 | { |
---|
519 | while (j > i) |
---|
520 | if (discards[--j] == 2) |
---|
521 | discards[j] = 0; |
---|
522 | } |
---|
523 | else |
---|
524 | { |
---|
525 | register unsigned int consec; |
---|
526 | unsigned int minimum = 1; |
---|
527 | unsigned int tem = length / 4; |
---|
528 | |
---|
529 | /* MINIMUM is approximate square root of LENGTH/4. |
---|
530 | A subrun of two or more provisionals can stand |
---|
531 | when LENGTH is at least 16. |
---|
532 | A subrun of 4 or more can stand when LENGTH >= 64. */ |
---|
533 | while ((tem = tem >> 2) > 0) |
---|
534 | minimum *= 2; |
---|
535 | minimum++; |
---|
536 | |
---|
537 | /* Cancel any subrun of MINIMUM or more provisionals |
---|
538 | within the larger run. */ |
---|
539 | for (j = 0, consec = 0; j < length; j++) |
---|
540 | if (discards[i + j] != 2) |
---|
541 | consec = 0; |
---|
542 | else if (minimum == ++consec) |
---|
543 | /* Back up to start of subrun, to cancel it all. */ |
---|
544 | j -= consec; |
---|
545 | else if (minimum < consec) |
---|
546 | discards[i + j] = 0; |
---|
547 | |
---|
548 | /* Scan from beginning of run |
---|
549 | until we find 3 or more nonprovisionals in a row |
---|
550 | or until the first nonprovisional at least 8 lines in. |
---|
551 | Until that point, cancel any provisionals. */ |
---|
552 | for (j = 0, consec = 0; j < length; j++) |
---|
553 | { |
---|
554 | if (j >= 8 && discards[i + j] == 1) |
---|
555 | break; |
---|
556 | if (discards[i + j] == 2) |
---|
557 | consec = 0, discards[i + j] = 0; |
---|
558 | else if (discards[i + j] == 0) |
---|
559 | consec = 0; |
---|
560 | else |
---|
561 | consec++; |
---|
562 | if (consec == 3) |
---|
563 | break; |
---|
564 | } |
---|
565 | |
---|
566 | /* I advances to the last line of the run. */ |
---|
567 | i += length - 1; |
---|
568 | |
---|
569 | /* Same thing, from end. */ |
---|
570 | for (j = 0, consec = 0; j < length; j++) |
---|
571 | { |
---|
572 | if (j >= 8 && discards[i - j] == 1) |
---|
573 | break; |
---|
574 | if (discards[i - j] == 2) |
---|
575 | consec = 0, discards[i - j] = 0; |
---|
576 | else if (discards[i - j] == 0) |
---|
577 | consec = 0; |
---|
578 | else |
---|
579 | consec++; |
---|
580 | if (consec == 3) |
---|
581 | break; |
---|
582 | } |
---|
583 | } |
---|
584 | } |
---|
585 | } |
---|
586 | } |
---|
587 | |
---|
588 | /* Actually discard the lines. */ |
---|
589 | for (f = 0; f < 2; f++) |
---|
590 | { |
---|
591 | char *discards = discarded[f]; |
---|
592 | unsigned int end = filevec[f].buffered_lines; |
---|
593 | unsigned int j = 0; |
---|
594 | for (i = 0; i < end; ++i) |
---|
595 | if (no_discards || discards[i] == 0) |
---|
596 | { |
---|
597 | filevec[f].undiscarded[j] = filevec[f].equivs[i]; |
---|
598 | filevec[f].realindexes[j++] = i; |
---|
599 | } |
---|
600 | else |
---|
601 | filevec[f].changed_flag[i] = 1; |
---|
602 | filevec[f].nondiscarded_lines = j; |
---|
603 | } |
---|
604 | |
---|
605 | free (discarded[0]); |
---|
606 | free (equiv_count[0]); |
---|
607 | } |
---|
608 | |
---|
609 | /* Adjust inserts/deletes of identical lines to join changes |
---|
610 | as much as possible. |
---|
611 | |
---|
612 | We do something when a run of changed lines include a |
---|
613 | line at one end and have an excluded, identical line at the other. |
---|
614 | We are free to choose which identical line is included. |
---|
615 | `compareseq' usually chooses the one at the beginning, |
---|
616 | but usually it is cleaner to consider the following identical line |
---|
617 | to be the "change". */ |
---|
618 | |
---|
619 | int inhibit; |
---|
620 | |
---|
621 | static void |
---|
622 | shift_boundaries (filevec) |
---|
623 | struct file_data filevec[]; |
---|
624 | { |
---|
625 | int f; |
---|
626 | |
---|
627 | if (inhibit) |
---|
628 | return; |
---|
629 | |
---|
630 | for (f = 0; f < 2; f++) |
---|
631 | { |
---|
632 | char *changed = filevec[f].changed_flag; |
---|
633 | char const *other_changed = filevec[1-f].changed_flag; |
---|
634 | int const *equivs = filevec[f].equivs; |
---|
635 | int i = 0; |
---|
636 | int j = 0; |
---|
637 | int i_end = filevec[f].buffered_lines; |
---|
638 | |
---|
639 | while (1) |
---|
640 | { |
---|
641 | int runlength, start, corresponding; |
---|
642 | |
---|
643 | /* Scan forwards to find beginning of another run of changes. |
---|
644 | Also keep track of the corresponding point in the other file. */ |
---|
645 | |
---|
646 | while (i < i_end && changed[i] == 0) |
---|
647 | { |
---|
648 | while (other_changed[j++]) |
---|
649 | continue; |
---|
650 | i++; |
---|
651 | } |
---|
652 | |
---|
653 | if (i == i_end) |
---|
654 | break; |
---|
655 | |
---|
656 | start = i; |
---|
657 | |
---|
658 | /* Find the end of this run of changes. */ |
---|
659 | |
---|
660 | while (changed[++i]) |
---|
661 | continue; |
---|
662 | while (other_changed[j]) |
---|
663 | j++; |
---|
664 | |
---|
665 | do |
---|
666 | { |
---|
667 | /* Record the length of this run of changes, so that |
---|
668 | we can later determine whether the run has grown. */ |
---|
669 | runlength = i - start; |
---|
670 | |
---|
671 | /* Move the changed region back, so long as the |
---|
672 | previous unchanged line matches the last changed one. |
---|
673 | This merges with previous changed regions. */ |
---|
674 | |
---|
675 | while (start && equivs[start - 1] == equivs[i - 1]) |
---|
676 | { |
---|
677 | changed[--start] = 1; |
---|
678 | changed[--i] = 0; |
---|
679 | while (changed[start - 1]) |
---|
680 | start--; |
---|
681 | while (other_changed[--j]) |
---|
682 | continue; |
---|
683 | } |
---|
684 | |
---|
685 | /* Set CORRESPONDING to the end of the changed run, at the last |
---|
686 | point where it corresponds to a changed run in the other file. |
---|
687 | CORRESPONDING == I_END means no such point has been found. */ |
---|
688 | corresponding = other_changed[j - 1] ? i : i_end; |
---|
689 | |
---|
690 | /* Move the changed region forward, so long as the |
---|
691 | first changed line matches the following unchanged one. |
---|
692 | This merges with following changed regions. |
---|
693 | Do this second, so that if there are no merges, |
---|
694 | the changed region is moved forward as far as possible. */ |
---|
695 | |
---|
696 | while (i != i_end && equivs[start] == equivs[i]) |
---|
697 | { |
---|
698 | changed[start++] = 0; |
---|
699 | changed[i++] = 1; |
---|
700 | while (changed[i]) |
---|
701 | i++; |
---|
702 | while (other_changed[++j]) |
---|
703 | corresponding = i; |
---|
704 | } |
---|
705 | } |
---|
706 | while (runlength != i - start); |
---|
707 | |
---|
708 | /* If possible, move the fully-merged run of changes |
---|
709 | back to a corresponding run in the other file. */ |
---|
710 | |
---|
711 | while (corresponding < i) |
---|
712 | { |
---|
713 | changed[--start] = 1; |
---|
714 | changed[--i] = 0; |
---|
715 | while (other_changed[--j]) |
---|
716 | continue; |
---|
717 | } |
---|
718 | } |
---|
719 | } |
---|
720 | } |
---|
721 | |
---|
722 | /* Cons an additional entry onto the front of an edit script OLD. |
---|
723 | LINE0 and LINE1 are the first affected lines in the two files (origin 0). |
---|
724 | DELETED is the number of lines deleted here from file 0. |
---|
725 | INSERTED is the number of lines inserted here in file 1. |
---|
726 | |
---|
727 | If DELETED is 0 then LINE0 is the number of the line before |
---|
728 | which the insertion was done; vice versa for INSERTED and LINE1. */ |
---|
729 | |
---|
730 | static struct change * |
---|
731 | add_change (line0, line1, deleted, inserted, old) |
---|
732 | int line0, line1, deleted, inserted; |
---|
733 | struct change *old; |
---|
734 | { |
---|
735 | struct change *new = (struct change *) xmalloc (sizeof (struct change)); |
---|
736 | |
---|
737 | new->line0 = line0; |
---|
738 | new->line1 = line1; |
---|
739 | new->inserted = inserted; |
---|
740 | new->deleted = deleted; |
---|
741 | new->link = old; |
---|
742 | return new; |
---|
743 | } |
---|
744 | |
---|
745 | /* Scan the tables of which lines are inserted and deleted, |
---|
746 | producing an edit script in reverse order. */ |
---|
747 | |
---|
748 | static struct change * |
---|
749 | build_reverse_script (filevec) |
---|
750 | struct file_data const filevec[]; |
---|
751 | { |
---|
752 | struct change *script = 0; |
---|
753 | char *changed0 = filevec[0].changed_flag; |
---|
754 | char *changed1 = filevec[1].changed_flag; |
---|
755 | int len0 = filevec[0].buffered_lines; |
---|
756 | int len1 = filevec[1].buffered_lines; |
---|
757 | |
---|
758 | /* Note that changedN[len0] does exist, and contains 0. */ |
---|
759 | |
---|
760 | int i0 = 0, i1 = 0; |
---|
761 | |
---|
762 | while (i0 < len0 || i1 < len1) |
---|
763 | { |
---|
764 | if (changed0[i0] || changed1[i1]) |
---|
765 | { |
---|
766 | int line0 = i0, line1 = i1; |
---|
767 | |
---|
768 | /* Find # lines changed here in each file. */ |
---|
769 | while (changed0[i0]) ++i0; |
---|
770 | while (changed1[i1]) ++i1; |
---|
771 | |
---|
772 | /* Record this change. */ |
---|
773 | script = add_change (line0, line1, i0 - line0, i1 - line1, script); |
---|
774 | } |
---|
775 | |
---|
776 | /* We have reached lines in the two files that match each other. */ |
---|
777 | i0++, i1++; |
---|
778 | } |
---|
779 | |
---|
780 | return script; |
---|
781 | } |
---|
782 | |
---|
783 | /* Scan the tables of which lines are inserted and deleted, |
---|
784 | producing an edit script in forward order. */ |
---|
785 | |
---|
786 | static struct change * |
---|
787 | build_script (filevec) |
---|
788 | struct file_data const filevec[]; |
---|
789 | { |
---|
790 | struct change *script = 0; |
---|
791 | char *changed0 = filevec[0].changed_flag; |
---|
792 | char *changed1 = filevec[1].changed_flag; |
---|
793 | int i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines; |
---|
794 | |
---|
795 | /* Note that changedN[-1] does exist, and contains 0. */ |
---|
796 | |
---|
797 | while (i0 >= 0 || i1 >= 0) |
---|
798 | { |
---|
799 | if (changed0[i0 - 1] || changed1[i1 - 1]) |
---|
800 | { |
---|
801 | int line0 = i0, line1 = i1; |
---|
802 | |
---|
803 | /* Find # lines changed here in each file. */ |
---|
804 | while (changed0[i0 - 1]) --i0; |
---|
805 | while (changed1[i1 - 1]) --i1; |
---|
806 | |
---|
807 | /* Record this change. */ |
---|
808 | script = add_change (i0, i1, line0 - i0, line1 - i1, script); |
---|
809 | } |
---|
810 | |
---|
811 | /* We have reached lines in the two files that match each other. */ |
---|
812 | i0--, i1--; |
---|
813 | } |
---|
814 | |
---|
815 | return script; |
---|
816 | } |
---|
817 | |
---|
818 | /* If CHANGES, briefly report that two files differed. */ |
---|
819 | static void |
---|
820 | briefly_report (changes, filevec) |
---|
821 | int changes; |
---|
822 | struct file_data const filevec[]; |
---|
823 | { |
---|
824 | if (changes) |
---|
825 | message (no_details_flag ? "Files %s and %s differ\n" |
---|
826 | : "Binary files %s and %s differ\n", |
---|
827 | filevec[0].name, filevec[1].name); |
---|
828 | } |
---|
829 | |
---|
830 | /* Report the differences of two files. DEPTH is the current directory |
---|
831 | depth. */ |
---|
832 | int |
---|
833 | diff_2_files (filevec, depth) |
---|
834 | struct file_data filevec[]; |
---|
835 | int depth; |
---|
836 | { |
---|
837 | int diags; |
---|
838 | int i; |
---|
839 | struct change *e, *p; |
---|
840 | struct change *script; |
---|
841 | int changes; |
---|
842 | |
---|
843 | |
---|
844 | /* If we have detected that either file is binary, |
---|
845 | compare the two files as binary. This can happen |
---|
846 | only when the first chunk is read. |
---|
847 | Also, --brief without any --ignore-* options means |
---|
848 | we can speed things up by treating the files as binary. */ |
---|
849 | |
---|
850 | if (read_files (filevec, no_details_flag & ~ignore_some_changes)) |
---|
851 | { |
---|
852 | /* Files with different lengths must be different. */ |
---|
853 | if (filevec[0].stat.st_size != filevec[1].stat.st_size |
---|
854 | && (filevec[0].desc < 0 || S_ISREG (filevec[0].stat.st_mode)) |
---|
855 | && (filevec[1].desc < 0 || S_ISREG (filevec[1].stat.st_mode))) |
---|
856 | changes = 1; |
---|
857 | |
---|
858 | /* Standard input equals itself. */ |
---|
859 | else if (filevec[0].desc == filevec[1].desc) |
---|
860 | changes = 0; |
---|
861 | |
---|
862 | else |
---|
863 | /* Scan both files, a buffer at a time, looking for a difference. */ |
---|
864 | { |
---|
865 | /* Allocate same-sized buffers for both files. */ |
---|
866 | size_t buffer_size = buffer_lcm (STAT_BLOCKSIZE (filevec[0].stat), |
---|
867 | STAT_BLOCKSIZE (filevec[1].stat)); |
---|
868 | for (i = 0; i < 2; i++) |
---|
869 | filevec[i].buffer = xrealloc (filevec[i].buffer, buffer_size); |
---|
870 | |
---|
871 | for (;; filevec[0].buffered_chars = filevec[1].buffered_chars = 0) |
---|
872 | { |
---|
873 | /* Read a buffer's worth from both files. */ |
---|
874 | for (i = 0; i < 2; i++) |
---|
875 | if (0 <= filevec[i].desc) |
---|
876 | while (filevec[i].buffered_chars != buffer_size) |
---|
877 | { |
---|
878 | int r = read (filevec[i].desc, |
---|
879 | filevec[i].buffer |
---|
880 | + filevec[i].buffered_chars, |
---|
881 | buffer_size - filevec[i].buffered_chars); |
---|
882 | if (r == 0) |
---|
883 | break; |
---|
884 | if (r < 0) |
---|
885 | pfatal_with_name (filevec[i].name); |
---|
886 | filevec[i].buffered_chars += r; |
---|
887 | } |
---|
888 | |
---|
889 | /* If the buffers differ, the files differ. */ |
---|
890 | if (filevec[0].buffered_chars != filevec[1].buffered_chars |
---|
891 | || (filevec[0].buffered_chars != 0 |
---|
892 | && memcmp (filevec[0].buffer, |
---|
893 | filevec[1].buffer, |
---|
894 | filevec[0].buffered_chars) != 0)) |
---|
895 | { |
---|
896 | changes = 1; |
---|
897 | break; |
---|
898 | } |
---|
899 | |
---|
900 | /* If we reach end of file, the files are the same. */ |
---|
901 | if (filevec[0].buffered_chars != buffer_size) |
---|
902 | { |
---|
903 | changes = 0; |
---|
904 | break; |
---|
905 | } |
---|
906 | } |
---|
907 | } |
---|
908 | |
---|
909 | briefly_report (changes, filevec); |
---|
910 | } |
---|
911 | else |
---|
912 | { |
---|
913 | /* Allocate vectors for the results of comparison: |
---|
914 | a flag for each line of each file, saying whether that line |
---|
915 | is an insertion or deletion. |
---|
916 | Allocate an extra element, always zero, at each end of each vector. */ |
---|
917 | |
---|
918 | size_t s = filevec[0].buffered_lines + filevec[1].buffered_lines + 4; |
---|
919 | filevec[0].changed_flag = xmalloc (s); |
---|
920 | bzero (filevec[0].changed_flag, s); |
---|
921 | filevec[0].changed_flag++; |
---|
922 | filevec[1].changed_flag = filevec[0].changed_flag |
---|
923 | + filevec[0].buffered_lines + 2; |
---|
924 | |
---|
925 | /* Some lines are obviously insertions or deletions |
---|
926 | because they don't match anything. Detect them now, and |
---|
927 | avoid even thinking about them in the main comparison algorithm. */ |
---|
928 | |
---|
929 | discard_confusing_lines (filevec); |
---|
930 | |
---|
931 | /* Now do the main comparison algorithm, considering just the |
---|
932 | undiscarded lines. */ |
---|
933 | |
---|
934 | xvec = filevec[0].undiscarded; |
---|
935 | yvec = filevec[1].undiscarded; |
---|
936 | diags = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3; |
---|
937 | fdiag = (int *) xmalloc (diags * (2 * sizeof (int))); |
---|
938 | bdiag = fdiag + diags; |
---|
939 | fdiag += filevec[1].nondiscarded_lines + 1; |
---|
940 | bdiag += filevec[1].nondiscarded_lines + 1; |
---|
941 | |
---|
942 | /* Set TOO_EXPENSIVE to be approximate square root of input size, |
---|
943 | bounded below by 256. */ |
---|
944 | too_expensive = 1; |
---|
945 | for (i = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines; |
---|
946 | i != 0; i >>= 2) |
---|
947 | too_expensive <<= 1; |
---|
948 | too_expensive = max (256, too_expensive); |
---|
949 | |
---|
950 | files[0] = filevec[0]; |
---|
951 | files[1] = filevec[1]; |
---|
952 | |
---|
953 | compareseq (0, filevec[0].nondiscarded_lines, |
---|
954 | 0, filevec[1].nondiscarded_lines, no_discards); |
---|
955 | |
---|
956 | free (fdiag - (filevec[1].nondiscarded_lines + 1)); |
---|
957 | |
---|
958 | /* Modify the results slightly to make them prettier |
---|
959 | in cases where that can validly be done. */ |
---|
960 | |
---|
961 | shift_boundaries (filevec); |
---|
962 | |
---|
963 | /* Get the results of comparison in the form of a chain |
---|
964 | of `struct change's -- an edit script. */ |
---|
965 | |
---|
966 | if (output_style == OUTPUT_ED) |
---|
967 | script = build_reverse_script (filevec); |
---|
968 | else |
---|
969 | script = build_script (filevec); |
---|
970 | |
---|
971 | /* Set CHANGES if we had any diffs. |
---|
972 | If some changes are ignored, we must scan the script to decide. */ |
---|
973 | if (ignore_blank_lines_flag || ignore_regexp_list) |
---|
974 | { |
---|
975 | struct change *next = script; |
---|
976 | changes = 0; |
---|
977 | |
---|
978 | while (next && changes == 0) |
---|
979 | { |
---|
980 | struct change *this, *end; |
---|
981 | int first0, last0, first1, last1, deletes, inserts; |
---|
982 | |
---|
983 | /* Find a set of changes that belong together. */ |
---|
984 | this = next; |
---|
985 | end = find_change (next); |
---|
986 | |
---|
987 | /* Disconnect them from the rest of the changes, making them |
---|
988 | a hunk, and remember the rest for next iteration. */ |
---|
989 | next = end->link; |
---|
990 | end->link = 0; |
---|
991 | |
---|
992 | /* Determine whether this hunk is really a difference. */ |
---|
993 | analyze_hunk (this, &first0, &last0, &first1, &last1, |
---|
994 | &deletes, &inserts); |
---|
995 | |
---|
996 | /* Reconnect the script so it will all be freed properly. */ |
---|
997 | end->link = next; |
---|
998 | |
---|
999 | if (deletes || inserts) |
---|
1000 | changes = 1; |
---|
1001 | } |
---|
1002 | } |
---|
1003 | else |
---|
1004 | changes = (script != 0); |
---|
1005 | |
---|
1006 | if (no_details_flag) |
---|
1007 | briefly_report (changes, filevec); |
---|
1008 | else |
---|
1009 | { |
---|
1010 | if (changes || ! no_diff_means_no_output) |
---|
1011 | { |
---|
1012 | /* Record info for starting up output, |
---|
1013 | to be used if and when we have some output to print. */ |
---|
1014 | setup_output (files[0].name, files[1].name, depth); |
---|
1015 | |
---|
1016 | switch (output_style) |
---|
1017 | { |
---|
1018 | case OUTPUT_CONTEXT: |
---|
1019 | print_context_script (script, 0); |
---|
1020 | break; |
---|
1021 | |
---|
1022 | case OUTPUT_UNIFIED: |
---|
1023 | print_context_script (script, 1); |
---|
1024 | break; |
---|
1025 | |
---|
1026 | case OUTPUT_ED: |
---|
1027 | print_ed_script (script); |
---|
1028 | break; |
---|
1029 | |
---|
1030 | case OUTPUT_FORWARD_ED: |
---|
1031 | pr_forward_ed_script (script); |
---|
1032 | break; |
---|
1033 | |
---|
1034 | case OUTPUT_RCS: |
---|
1035 | print_rcs_script (script); |
---|
1036 | break; |
---|
1037 | |
---|
1038 | case OUTPUT_NORMAL: |
---|
1039 | print_normal_script (script); |
---|
1040 | break; |
---|
1041 | |
---|
1042 | case OUTPUT_IFDEF: |
---|
1043 | print_ifdef_script (script); |
---|
1044 | break; |
---|
1045 | |
---|
1046 | case OUTPUT_SDIFF: |
---|
1047 | print_sdiff_script (script); |
---|
1048 | } |
---|
1049 | |
---|
1050 | finish_output (); |
---|
1051 | } |
---|
1052 | } |
---|
1053 | |
---|
1054 | free (filevec[0].undiscarded); |
---|
1055 | |
---|
1056 | free (filevec[0].changed_flag - 1); |
---|
1057 | |
---|
1058 | for (i = 1; i >= 0; --i) |
---|
1059 | free (filevec[i].equivs); |
---|
1060 | |
---|
1061 | for (i = 0; i < 2; ++i) |
---|
1062 | free (filevec[i].linbuf + filevec[i].linbuf_base); |
---|
1063 | |
---|
1064 | for (e = script; e; e = p) |
---|
1065 | { |
---|
1066 | p = e->link; |
---|
1067 | free (e); |
---|
1068 | } |
---|
1069 | |
---|
1070 | if (! ROBUST_OUTPUT_STYLE (output_style)) |
---|
1071 | for (i = 0; i < 2; ++i) |
---|
1072 | if (filevec[i].missing_newline) |
---|
1073 | { |
---|
1074 | error ("No newline at end of file %s", filevec[i].name, ""); |
---|
1075 | changes = 2; |
---|
1076 | } |
---|
1077 | } |
---|
1078 | |
---|
1079 | if (filevec[0].buffer != filevec[1].buffer) |
---|
1080 | free (filevec[0].buffer); |
---|
1081 | free (filevec[1].buffer); |
---|
1082 | |
---|
1083 | return changes; |
---|
1084 | } |
---|