Tim Peters writes:

Attached is a cleaned-up version of ndiff (added useful module
docstring, now echo'ed in case of cmd line mistake); added -q option
to suppress initial file identification lines; + other minor cleanups,
& a slightly faster match engine.
This commit is contained in:
Guido van Rossum 1999-03-27 13:34:01 +00:00
parent 806a467fbf
commit a3433e89eb

View file

@ -1,16 +1,50 @@
#! /usr/bin/env python
# Released to the public domain $JustDate: 3/16/98 $,
# by Tim Peters (email tim_one@email.msn.com).
# Module ndiff version 1.3.0
# Released to the public domain 26-Mar-1999,
# by Tim Peters (tim_one@email.msn.com).
# ndiff file1 file2 -- a human-friendly file differencer.
# Provided as-is; use at your own risk; no warranty; no promises; enjoy!
# $Revision$
"""ndiff [-q] file1 file2
Print a human-friendly file difference report to stdout. Both inter-
and intra-line differences are noted.
If -q ("quiet") is not specified, the first two lines of output are
-: file1
+: file2
Each remaining line begins with a two-letter code:
"- " line unique to file1
"+ " line unique to file2
" " line common to both files
"? " line not present in either input file
Lines beginning with "? " attempt to guide the eye to intraline
differences, and were not present in either input file.
The first file can be recovered by retaining only lines that begin with
" " or "- ", and deleting those 2-character prefixes.
The second file can be recovered similarly, but by retaining only " "
and "+ " lines. On Unix, the second file can be recovered by piping the
output through
sed -n '/^[+ ] /s/^..//p'
Modifications to recover the first file are left as an exercise for
the reader.
See module comments for details and programmatic interface.
"""
__version__ = 1, 3, 0
# SequenceMatcher tries to compute a "human-friendly diff" between
# two sequences (chiefly picturing a file as a sequence of lines,
# and a line as a sequence of characters, here). Unlike UNIX(tm) diff,
# e.g., the fundamental notion is the longest *contiguous* & junk-free
# and a line as a sequence of characters, here). Unlike e.g. UNIX(tm)
# diff, the fundamental notion is the longest *contiguous* & junk-free
# matching subsequence. That's what catches peoples' eyes. The
# Windows(tm) windiff has another interesting notion, pairing up elements
# that appear uniquely in each sequence. That, and the method here,
@ -26,11 +60,11 @@
# apart. Restricting synch points to contiguous matches preserves some
# notion of locality, at the occasional cost of producing a longer diff.
#
# With respect to junk, an earlier verion of ndiff simply refused to
# With respect to junk, an earlier version of ndiff simply refused to
# *start* a match with a junk element. The result was cases like this:
# before: private Thread currentThread;
# after: private volatile Thread currentThread;
# If you consider whitespace to be junk, the longest continguous match
# If you consider whitespace to be junk, the longest contiguous match
# not starting with junk is "e Thread currentThread". So ndiff reported
# that "e volatil" was inserted between the 't' and the 'e' in "private".
# While an accurate view, to people that's absurd. The current version
@ -40,23 +74,9 @@
# preceding blank; then "private" is matched, and extended to suck up the
# following blank; then "Thread" is matched; and finally ndiff reports
# that "volatile " was inserted before "Thread". The only quibble
# remaining is that perhaps it was really the case that " volative"
# remaining is that perhaps it was really the case that " volatile"
# was inserted after "private". I can live with that <wink>.
#
# NOTE on the output: From an ndiff report,
# 1) The first file can be recovered by retaining only lines that begin
# with " " or "- ", and deleting those 2-character prefixes.
# 2) The second file can be recovered similarly, but by retaining only
# " " and "+ " lines.
# 3) Lines beginning with "? " attempt to guide the eye to intraline
# differences, and were not present in either input file.
#
# COROLLARY:
# On Unix, the second file can be recovered by piping the output through
# sed -n '/^[+ ] /s/^..//p'
# Modifications to recover the first file are left as an exercise for
# the reader.
#
# NOTE on junk: the module-level names
# IS_LINE_JUNK
# IS_CHARACTER_JUNK
@ -70,8 +90,8 @@
#
# After setting those, you can call fcompare(f1name, f2name) with the
# names of the files you want to compare. The difference report
# is sent to stdout. Or you can call main(), which expects to find
# (exactly) the two file names in sys.argv.
# is sent to stdout. Or you can call main(args), passing what would
# have been in sys.argv[1:] had the cmd-line form been used.
import string
TRACE = 0
@ -148,7 +168,7 @@ def set_seq2(self, b):
self.fullbcount = None
self.__chain_b()
# for each element x in b, set b2j[x] to a list of the indices in
# For each element x in b, set b2j[x] to a list of the indices in
# b where x appears; the indices are in increasing order; note that
# the number of times x appears in b is len(b2j[x]) ...
# when self.isjunk is defined, junk elements don't show up in this
@ -173,7 +193,7 @@ def __chain_b(self):
b = self.b
self.b2j = b2j = {}
self.b2jhas = b2jhas = b2j.has_key
for i in xrange(0, len(b)):
for i in xrange(len(b)):
elt = b[i]
if b2jhas(elt):
b2j[elt].append(i)
@ -210,9 +230,9 @@ def find_longest_match(self, alo, ahi, blo, bhi):
k >= k'
i <= i'
and if i == i', j <= j'
In other words, of all maximal matching blocks, returns one
In other words, of all maximal matching blocks, return one
that starts earliest in a, and of all those maximal matching
blocks that start earliest in a, returns the one that starts
blocks that start earliest in a, return the one that starts
earliest in b.
If isjunk is defined, first the longest matching block is
@ -223,7 +243,7 @@ def find_longest_match(self, alo, ahi, blo, bhi):
as identical junk happens to be adjacent to an "interesting"
match.
If no blocks match, returns (alo, blo, 0).
If no blocks match, return (alo, blo, 0).
"""
# CAUTION: stripping common prefix or suffix would be incorrect.
@ -238,40 +258,28 @@ def find_longest_match(self, alo, ahi, blo, bhi):
# Windiff ends up at the same place as diff, but by pairing up
# the unique 'b's and then matching the first two 'a's.
# find longest junk-free match
a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
besti, bestj, bestsize = alo, blo, 0
# find longest junk-free match
# during an iteration of the loop, j2len[j] = length of longest
# junk-free match ending with a[i-1] and b[j]
j2len = {}
nothing = []
for i in xrange(alo, ahi):
# check for longest match starting at a[i]
if i + bestsize >= ahi:
# we're too far right to get a new best
break
# look at all instances of a[i] in b; note that because
# b2j has no junk keys, the loop is skipped if a[i] is junk
for j in b2j.get(a[i], []):
j2lenget = j2len.get
newj2len = {}
for j in b2j.get(a[i], nothing):
# a[i] matches b[j]
if j < blo:
continue
if j + bestsize >= bhi:
# we're too far right to get a new best, here or
# anywhere to the right
if j >= bhi:
break
if a[i + bestsize] != b[j + bestsize]:
# can't be longer match; this test is not necessary
# for correctness, but is a huge win for efficiency
continue
# set k to length of match
k = 1 # a[i] == b[j] already known
while i + k < ahi and j + k < bhi and \
a[i+k] == b[j+k] and not isbjunk(b[j+k]):
k = k + 1
k = newj2len[j] = j2lenget(j-1, 0) + 1
if k > bestsize:
besti, bestj, bestsize = i, j, k
if i + bestsize >= ahi:
# only time in my life I really wanted a
# labelled break <wink> -- we're done with
# both loops now
break
besti, bestj, bestsize = i-k+1, j-k+1, k
j2len = newj2len
# Now that we have a wholly interesting match (albeit possibly
# empty!), we may as well suck up the matching junk on each
@ -294,101 +302,6 @@ def find_longest_match(self, alo, ahi, blo, bhi):
print " returns", besti, bestj, bestsize
return besti, bestj, bestsize
# A different implementation, using a binary doubling technique that
# does far fewer element compares (trades 'em for integer compares),
# and has n*lg n worst-case behavior. Alas, the code is much harder
# to follow (the details are tricky!), and in most cases I've seen,
# it takes at least 50% longer than the "clever dumb" method above;
# probably due to creating layers of small dicts.
# NOTE: this no longer matches the version above wrt junk; remains
# too unpromising to update it; someday, though ...
# def find_longest_match(self, alo, ahi, blo, bhi):
# """Find longest matching block in a[alo:ahi] and b[blo:bhi].
#
# Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
# alo <= i <= i+k <= ahi
# blo <= j <= j+k <= bhi
# and for all (i',j',k') meeting those conditions,
# k >= k'
# i <= i'
# and if i == i', j <= j'
# In other words, of all maximal matching blocks, returns one
# that starts earliest in a, and of all those maximal matching
# blocks that start earliest in a, returns the one that starts
# earliest in b.
#
# If no blocks match, returns (alo, blo, 0).
# """
#
# a, b2j = self.a, self.b2j
# # alljs[size][i] is a set of all j's s.t. a[i:i+len] matches
# # b[j:j+len]
# alljs = {}
# alljs[1] = js = {}
# ahits = {}
# for i in xrange(alo, ahi):
# elt = a[i]
# if ahits.has_key(elt):
# js[i] = ahits[elt]
# continue
# if b2j.has_key(elt):
# in_range = {}
# for j in b2j[elt]:
# if j >= blo:
# if j >= bhi:
# break
# in_range[j] = 1
# if in_range:
# ahits[elt] = js[i] = in_range
# del ahits
# size = 1
# while js:
# oldsize = size
# size = size + size
# oldjs = js
# alljs[size] = js = {}
# for i in oldjs.keys():
# # i has matches of size oldsize
# if not oldjs.has_key(i + oldsize):
# # can't double it
# continue
# second_js = oldjs[i + oldsize]
# answer = {}
# for j in oldjs[i].keys():
# if second_js.has_key(j + oldsize):
# answer[j] = 1
# if answer:
# js[i] = answer
# del alljs[size]
# size = size >> 1 # max power of 2 with a match
# if not size:
# return alo, blo, 0
# besti, bestj, bestsize = alo, blo, 0
# fatis = alljs[size].keys()
# fatis.sort()
# for i in fatis:
# # figure out longest match starting at a[i]
# totalsize = halfsize = size
# # i has matches of len totalsize at the indices in js
# js = alljs[size][i].keys()
# while halfsize > 1:
# halfsize = halfsize >> 1
# # is there a match of len halfsize starting at
# # i + totalsize?
# newjs = []
# if alljs[halfsize].has_key(i + totalsize):
# second_js = alljs[halfsize][i + totalsize]
# for j in js:
# if second_js.has_key(j + totalsize):
# newjs.append(j)
# if newjs:
# totalsize = totalsize + halfsize
# js = newjs
# if totalsize > bestsize:
# besti, bestj, bestsize = i, min(js), totalsize
# return besti, bestj, bestsize
def get_matching_blocks(self):
if self.matching_blocks is not None:
return self.matching_blocks
@ -621,7 +534,7 @@ def fopen(fname):
try:
return open(fname, 'r')
except IOError, detail:
print "couldn't open " + fname + ": " + `detail`
print "couldn't open " + fname + ": " + str(detail)
return 0
# open two files & spray the diff to stdout; return false iff a problem
@ -649,24 +562,39 @@ def fcompare(f1name, f2name):
return 1
# get file names from argv & compare; return false iff a problem
def main():
from sys import argv
if len(argv) != 3:
print 'need 2 args'
# crack args (sys.argv[1:] is normal) & compare;
# return false iff a problem
def main(args):
import getopt
try:
opts, args = getopt.getopt(args, "q")
except getopt.error, detail:
print str(detail)
print __doc__
return 0
[f1name, f2name] = argv[1:3]
print '-:', f1name
print '+:', f2name
noisy = 1
for opt, val in opts:
if opt == "-q":
noisy = 0
if len(args) != 2:
print 'need 2 args'
print __doc__
return 0
f1name, f2name = args
if noisy:
print '-:', f1name
print '+:', f2name
return fcompare(f1name, f2name)
if __name__ == '__main__':
import sys
args = sys.argv[1:]
if 1:
main()
main(args)
else:
import profile, pstats
statf = "ndiff.pro"
profile.run("main()", statf)
profile.run("main(args)", statf)
stats = pstats.Stats(statf)
stats.strip_dirs().sort_stats('time').print_stats()