mirror of
				https://github.com/python/cpython.git
				synced 2025-10-30 21:21:22 +00:00 
			
		
		
		
	Tim Peters writes:
Attached is a cleaned-up version of ndiff (added useful module docstring, now echo'ed in case of cmd line mistake); added -q option to suppress initial file identification lines; + other minor cleanups, & a slightly faster match engine.
This commit is contained in:
		
							parent
							
								
									806a467fbf
								
							
						
					
					
						commit
						a3433e89eb
					
				
					 1 changed files with 89 additions and 161 deletions
				
			
		|  | @ -1,16 +1,50 @@ | ||||||
| #! /usr/bin/env python | #! /usr/bin/env python | ||||||
| 
 | 
 | ||||||
| # Released to the public domain $JustDate:  3/16/98 $, | # Module ndiff version 1.3.0 | ||||||
| # by Tim Peters (email tim_one@email.msn.com). | # Released to the public domain 26-Mar-1999, | ||||||
|  | # by Tim Peters (tim_one@email.msn.com). | ||||||
| 
 | 
 | ||||||
| # ndiff file1 file2 -- a human-friendly file differencer. | # Provided as-is; use at your own risk; no warranty; no promises; enjoy! | ||||||
| 
 | 
 | ||||||
| # $Revision$ | """ndiff [-q] file1 file2 | ||||||
|  | 
 | ||||||
|  | Print a human-friendly file difference report to stdout.  Both inter- | ||||||
|  | and intra-line differences are noted. | ||||||
|  | 
 | ||||||
|  | If -q ("quiet") is not specified, the first two lines of output are | ||||||
|  | 
 | ||||||
|  | -: file1 | ||||||
|  | +: file2 | ||||||
|  | 
 | ||||||
|  | Each remaining line begins with a two-letter code: | ||||||
|  | 
 | ||||||
|  |     "- "    line unique to file1 | ||||||
|  |     "+ "    line unique to file2 | ||||||
|  |     "  "    line common to both files | ||||||
|  |     "? "    line not present in either input file | ||||||
|  | 
 | ||||||
|  | Lines beginning with "? " attempt to guide the eye to intraline | ||||||
|  | differences, and were not present in either input file. | ||||||
|  | 
 | ||||||
|  | The first file can be recovered by retaining only lines that begin with | ||||||
|  | "  " or "- ", and deleting those 2-character prefixes. | ||||||
|  | 
 | ||||||
|  | The second file can be recovered similarly, but by retaining only "  " | ||||||
|  | and "+ " lines.  On Unix, the second file can be recovered by piping the | ||||||
|  | output through | ||||||
|  |     sed -n '/^[+ ] /s/^..//p' | ||||||
|  | Modifications to recover the first file are left as an exercise for | ||||||
|  | the reader. | ||||||
|  | 
 | ||||||
|  | See module comments for details and programmatic interface. | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | __version__ = 1, 3, 0 | ||||||
| 
 | 
 | ||||||
| # SequenceMatcher tries to compute a "human-friendly diff" between | # SequenceMatcher tries to compute a "human-friendly diff" between | ||||||
| # two sequences (chiefly picturing a file as a sequence of lines, | # two sequences (chiefly picturing a file as a sequence of lines, | ||||||
| # and a line as a sequence of characters, here).  Unlike UNIX(tm) diff, | # and a line as a sequence of characters, here).  Unlike e.g. UNIX(tm) | ||||||
| # e.g., the fundamental notion is the longest *contiguous* & junk-free | # diff, the fundamental notion is the longest *contiguous* & junk-free | ||||||
| # matching subsequence.  That's what catches peoples' eyes.  The | # matching subsequence.  That's what catches peoples' eyes.  The | ||||||
| # Windows(tm) windiff has another interesting notion, pairing up elements | # Windows(tm) windiff has another interesting notion, pairing up elements | ||||||
| # that appear uniquely in each sequence.  That, and the method here, | # that appear uniquely in each sequence.  That, and the method here, | ||||||
|  | @ -26,11 +60,11 @@ | ||||||
| # apart.  Restricting synch points to contiguous matches preserves some | # apart.  Restricting synch points to contiguous matches preserves some | ||||||
| # notion of locality, at the occasional cost of producing a longer diff. | # notion of locality, at the occasional cost of producing a longer diff. | ||||||
| # | # | ||||||
| # With respect to junk, an earlier verion of ndiff simply refused to | # With respect to junk, an earlier version of ndiff simply refused to | ||||||
| # *start* a match with a junk element.  The result was cases like this: | # *start* a match with a junk element.  The result was cases like this: | ||||||
| #     before: private Thread currentThread; | #     before: private Thread currentThread; | ||||||
| #     after:  private volatile Thread currentThread; | #     after:  private volatile Thread currentThread; | ||||||
| # If you consider whitespace to be junk, the longest continguous match | # If you consider whitespace to be junk, the longest contiguous match | ||||||
| # not starting with junk is "e Thread currentThread".  So ndiff reported | # not starting with junk is "e Thread currentThread".  So ndiff reported | ||||||
| # that "e volatil" was inserted between the 't' and the 'e' in "private". | # that "e volatil" was inserted between the 't' and the 'e' in "private". | ||||||
| # While an accurate view, to people that's absurd.  The current version | # While an accurate view, to people that's absurd.  The current version | ||||||
|  | @ -40,23 +74,9 @@ | ||||||
| # preceding blank; then "private" is matched, and extended to suck up the | # preceding blank; then "private" is matched, and extended to suck up the | ||||||
| # following blank; then "Thread" is matched; and finally ndiff reports | # following blank; then "Thread" is matched; and finally ndiff reports | ||||||
| # that "volatile " was inserted before "Thread".  The only quibble | # that "volatile " was inserted before "Thread".  The only quibble | ||||||
| # remaining is that perhaps it was really the case that " volative" | # remaining is that perhaps it was really the case that " volatile" | ||||||
| # was inserted after "private".  I can live with that <wink>. | # was inserted after "private".  I can live with that <wink>. | ||||||
| # | # | ||||||
| # NOTE on the output:  From an ndiff report, |  | ||||||
| # 1) The first file can be recovered by retaining only lines that begin |  | ||||||
| #    with "  " or "- ", and deleting those 2-character prefixes. |  | ||||||
| # 2) The second file can be recovered similarly, but by retaining only |  | ||||||
| #    "  " and "+ " lines. |  | ||||||
| # 3) Lines beginning with "? " attempt to guide the eye to intraline |  | ||||||
| #    differences, and were not present in either input file. |  | ||||||
| # |  | ||||||
| # COROLLARY: |  | ||||||
| # On Unix, the second file can be recovered by piping the output through |  | ||||||
| #    sed -n '/^[+ ] /s/^..//p' |  | ||||||
| # Modifications to recover the first file are left as an exercise for |  | ||||||
| # the reader. |  | ||||||
| # |  | ||||||
| # NOTE on junk:  the module-level names | # NOTE on junk:  the module-level names | ||||||
| #    IS_LINE_JUNK | #    IS_LINE_JUNK | ||||||
| #    IS_CHARACTER_JUNK | #    IS_CHARACTER_JUNK | ||||||
|  | @ -70,8 +90,8 @@ | ||||||
| # | # | ||||||
| # After setting those, you can call fcompare(f1name, f2name) with the | # After setting those, you can call fcompare(f1name, f2name) with the | ||||||
| # names of the files you want to compare.  The difference report | # names of the files you want to compare.  The difference report | ||||||
| # is sent to stdout.  Or you can call main(), which expects to find | # is sent to stdout.  Or you can call main(args), passing what would | ||||||
| # (exactly) the two file names in sys.argv. | # have been in sys.argv[1:] had the cmd-line form been used. | ||||||
| 
 | 
 | ||||||
| import string | import string | ||||||
| TRACE = 0 | TRACE = 0 | ||||||
|  | @ -148,7 +168,7 @@ def set_seq2(self, b): | ||||||
|         self.fullbcount = None |         self.fullbcount = None | ||||||
|         self.__chain_b() |         self.__chain_b() | ||||||
| 
 | 
 | ||||||
|     # for each element x in b, set b2j[x] to a list of the indices in |     # For each element x in b, set b2j[x] to a list of the indices in | ||||||
|     # b where x appears; the indices are in increasing order; note that |     # b where x appears; the indices are in increasing order; note that | ||||||
|     # the number of times x appears in b is len(b2j[x]) ... |     # the number of times x appears in b is len(b2j[x]) ... | ||||||
|     # when self.isjunk is defined, junk elements don't show up in this |     # when self.isjunk is defined, junk elements don't show up in this | ||||||
|  | @ -173,7 +193,7 @@ def __chain_b(self): | ||||||
|         b = self.b |         b = self.b | ||||||
|         self.b2j = b2j = {} |         self.b2j = b2j = {} | ||||||
|         self.b2jhas = b2jhas = b2j.has_key |         self.b2jhas = b2jhas = b2j.has_key | ||||||
|         for i in xrange(0, len(b)): |         for i in xrange(len(b)): | ||||||
|             elt = b[i] |             elt = b[i] | ||||||
|             if b2jhas(elt): |             if b2jhas(elt): | ||||||
|                 b2j[elt].append(i) |                 b2j[elt].append(i) | ||||||
|  | @ -210,9 +230,9 @@ def find_longest_match(self, alo, ahi, blo, bhi): | ||||||
|             k >= k' |             k >= k' | ||||||
|             i <= i' |             i <= i' | ||||||
|             and if i == i', j <= j' |             and if i == i', j <= j' | ||||||
|         In other words, of all maximal matching blocks, returns one |         In other words, of all maximal matching blocks, return one | ||||||
|         that starts earliest in a, and of all those maximal matching |         that starts earliest in a, and of all those maximal matching | ||||||
|         blocks that start earliest in a, returns the one that starts |         blocks that start earliest in a, return the one that starts | ||||||
|         earliest in b. |         earliest in b. | ||||||
| 
 | 
 | ||||||
|         If isjunk is defined, first the longest matching block is |         If isjunk is defined, first the longest matching block is | ||||||
|  | @ -223,7 +243,7 @@ def find_longest_match(self, alo, ahi, blo, bhi): | ||||||
|         as identical junk happens to be adjacent to an "interesting" |         as identical junk happens to be adjacent to an "interesting" | ||||||
|         match. |         match. | ||||||
| 
 | 
 | ||||||
|         If no blocks match, returns (alo, blo, 0). |         If no blocks match, return (alo, blo, 0). | ||||||
|         """ |         """ | ||||||
| 
 | 
 | ||||||
|         # CAUTION:  stripping common prefix or suffix would be incorrect. |         # CAUTION:  stripping common prefix or suffix would be incorrect. | ||||||
|  | @ -238,40 +258,28 @@ def find_longest_match(self, alo, ahi, blo, bhi): | ||||||
|         # Windiff ends up at the same place as diff, but by pairing up |         # Windiff ends up at the same place as diff, but by pairing up | ||||||
|         # the unique 'b's and then matching the first two 'a's. |         # the unique 'b's and then matching the first two 'a's. | ||||||
| 
 | 
 | ||||||
|         # find longest junk-free match |  | ||||||
|         a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk |         a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk | ||||||
|         besti, bestj, bestsize = alo, blo, 0 |         besti, bestj, bestsize = alo, blo, 0 | ||||||
|  |         # find longest junk-free match | ||||||
|  |         # during an iteration of the loop, j2len[j] = length of longest | ||||||
|  |         # junk-free match ending with a[i-1] and b[j] | ||||||
|  |         j2len = {} | ||||||
|  |         nothing = [] | ||||||
|         for i in xrange(alo, ahi): |         for i in xrange(alo, ahi): | ||||||
|             # check for longest match starting at a[i] |  | ||||||
|             if i + bestsize >= ahi: |  | ||||||
|                 # we're too far right to get a new best |  | ||||||
|                 break |  | ||||||
|             # look at all instances of a[i] in b; note that because |             # look at all instances of a[i] in b; note that because | ||||||
|             # b2j has no junk keys, the loop is skipped if a[i] is junk |             # b2j has no junk keys, the loop is skipped if a[i] is junk | ||||||
|             for j in b2j.get(a[i], []): |             j2lenget = j2len.get | ||||||
|  |             newj2len = {} | ||||||
|  |             for j in b2j.get(a[i], nothing): | ||||||
|                 # a[i] matches b[j] |                 # a[i] matches b[j] | ||||||
|                 if j < blo: |                 if j < blo: | ||||||
|                     continue |                     continue | ||||||
|                 if j + bestsize >= bhi: |                 if j >= bhi: | ||||||
|                     # we're too far right to get a new best, here or |  | ||||||
|                     # anywhere to the right |  | ||||||
|                     break |                     break | ||||||
|                 if a[i + bestsize] != b[j + bestsize]: |                 k = newj2len[j] = j2lenget(j-1, 0) + 1 | ||||||
|                     # can't be longer match; this test is not necessary |  | ||||||
|                     # for correctness, but is a huge win for efficiency |  | ||||||
|                     continue |  | ||||||
|                 # set k to length of match |  | ||||||
|                 k = 1   # a[i] == b[j] already known |  | ||||||
|                 while i + k < ahi and j + k < bhi and \ |  | ||||||
|                       a[i+k] == b[j+k] and not isbjunk(b[j+k]): |  | ||||||
|                     k = k + 1 |  | ||||||
|                 if k > bestsize: |                 if k > bestsize: | ||||||
|                     besti, bestj, bestsize = i, j, k |                     besti, bestj, bestsize = i-k+1, j-k+1, k | ||||||
|                     if i + bestsize >= ahi: |             j2len = newj2len | ||||||
|                         # only time in my life I really wanted a |  | ||||||
|                         # labelled break <wink> -- we're done with |  | ||||||
|                         # both loops now |  | ||||||
|                         break |  | ||||||
| 
 | 
 | ||||||
|         # Now that we have a wholly interesting match (albeit possibly |         # Now that we have a wholly interesting match (albeit possibly | ||||||
|         # empty!), we may as well suck up the matching junk on each |         # empty!), we may as well suck up the matching junk on each | ||||||
|  | @ -294,101 +302,6 @@ def find_longest_match(self, alo, ahi, blo, bhi): | ||||||
|             print "    returns", besti, bestj, bestsize |             print "    returns", besti, bestj, bestsize | ||||||
|         return besti, bestj, bestsize |         return besti, bestj, bestsize | ||||||
| 
 | 
 | ||||||
| #   A different implementation, using a binary doubling technique that |  | ||||||
| #   does far fewer element compares (trades 'em for integer compares), |  | ||||||
| #   and has n*lg n worst-case behavior.  Alas, the code is much harder |  | ||||||
| #   to follow (the details are tricky!), and in most cases I've seen, |  | ||||||
| #   it takes at least 50% longer than the "clever dumb" method above; |  | ||||||
| #   probably due to creating layers of small dicts. |  | ||||||
| #   NOTE:  this no longer matches the version above wrt junk; remains |  | ||||||
| #   too unpromising to update it; someday, though ... |  | ||||||
| 
 |  | ||||||
| #    def find_longest_match(self, alo, ahi, blo, bhi): |  | ||||||
| #        """Find longest matching block in a[alo:ahi] and b[blo:bhi]. |  | ||||||
| # |  | ||||||
| #        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where |  | ||||||
| #            alo <= i <= i+k <= ahi |  | ||||||
| #            blo <= j <= j+k <= bhi |  | ||||||
| #        and for all (i',j',k') meeting those conditions, |  | ||||||
| #            k >= k' |  | ||||||
| #            i <= i' |  | ||||||
| #            and if i == i', j <= j' |  | ||||||
| #        In other words, of all maximal matching blocks, returns one |  | ||||||
| #        that starts earliest in a, and of all those maximal matching |  | ||||||
| #        blocks that start earliest in a, returns the one that starts |  | ||||||
| #        earliest in b. |  | ||||||
| # |  | ||||||
| #        If no blocks match, returns (alo, blo, 0). |  | ||||||
| #        """ |  | ||||||
| # |  | ||||||
| #        a, b2j = self.a, self.b2j |  | ||||||
| #        # alljs[size][i] is a set of all j's s.t. a[i:i+len] matches |  | ||||||
| #        # b[j:j+len] |  | ||||||
| #        alljs = {} |  | ||||||
| #        alljs[1] = js = {} |  | ||||||
| #        ahits = {} |  | ||||||
| #        for i in xrange(alo, ahi): |  | ||||||
| #            elt = a[i] |  | ||||||
| #            if ahits.has_key(elt): |  | ||||||
| #                js[i] = ahits[elt] |  | ||||||
| #                continue |  | ||||||
| #            if b2j.has_key(elt): |  | ||||||
| #                in_range = {} |  | ||||||
| #                for j in b2j[elt]: |  | ||||||
| #                    if j >= blo: |  | ||||||
| #                        if j >= bhi: |  | ||||||
| #                            break |  | ||||||
| #                        in_range[j] = 1 |  | ||||||
| #                if in_range: |  | ||||||
| #                    ahits[elt] = js[i] = in_range |  | ||||||
| #        del ahits |  | ||||||
| #        size = 1 |  | ||||||
| #        while js: |  | ||||||
| #            oldsize = size |  | ||||||
| #            size = size + size |  | ||||||
| #            oldjs = js |  | ||||||
| #            alljs[size] = js = {} |  | ||||||
| #            for i in oldjs.keys(): |  | ||||||
| #                # i has matches of size oldsize |  | ||||||
| #                if not oldjs.has_key(i + oldsize): |  | ||||||
| #                    # can't double it |  | ||||||
| #                    continue |  | ||||||
| #                second_js = oldjs[i + oldsize] |  | ||||||
| #                answer = {} |  | ||||||
| #                for j in oldjs[i].keys(): |  | ||||||
| #                    if second_js.has_key(j + oldsize): |  | ||||||
| #                        answer[j] = 1 |  | ||||||
| #                if answer: |  | ||||||
| #                    js[i] = answer |  | ||||||
| #        del alljs[size] |  | ||||||
| #        size = size >> 1    # max power of 2 with a match |  | ||||||
| #        if not size: |  | ||||||
| #            return alo, blo, 0 |  | ||||||
| #        besti, bestj, bestsize = alo, blo, 0 |  | ||||||
| #        fatis = alljs[size].keys() |  | ||||||
| #        fatis.sort() |  | ||||||
| #        for i in fatis: |  | ||||||
| #            # figure out longest match starting at a[i] |  | ||||||
| #            totalsize = halfsize = size |  | ||||||
| #            # i has matches of len totalsize at the indices in js |  | ||||||
| #            js = alljs[size][i].keys() |  | ||||||
| #            while halfsize > 1: |  | ||||||
| #                halfsize = halfsize >> 1 |  | ||||||
| #                # is there a match of len halfsize starting at |  | ||||||
| #                # i + totalsize? |  | ||||||
| #                newjs = [] |  | ||||||
| #                if alljs[halfsize].has_key(i + totalsize): |  | ||||||
| #                    second_js = alljs[halfsize][i + totalsize] |  | ||||||
| #                    for j in js: |  | ||||||
| #                        if second_js.has_key(j + totalsize): |  | ||||||
| #                            newjs.append(j) |  | ||||||
| #                if newjs: |  | ||||||
| #                    totalsize = totalsize + halfsize |  | ||||||
| #                    js = newjs |  | ||||||
| #            if totalsize > bestsize: |  | ||||||
| #                besti, bestj, bestsize = i, min(js), totalsize |  | ||||||
| #        return besti, bestj, bestsize |  | ||||||
| 
 |  | ||||||
|     def get_matching_blocks(self): |     def get_matching_blocks(self): | ||||||
|         if self.matching_blocks is not None: |         if self.matching_blocks is not None: | ||||||
|             return self.matching_blocks |             return self.matching_blocks | ||||||
|  | @ -621,7 +534,7 @@ def fopen(fname): | ||||||
|     try: |     try: | ||||||
|         return open(fname, 'r') |         return open(fname, 'r') | ||||||
|     except IOError, detail: |     except IOError, detail: | ||||||
|         print "couldn't open " + fname + ": " + `detail` |         print "couldn't open " + fname + ": " + str(detail) | ||||||
|         return 0 |         return 0 | ||||||
| 
 | 
 | ||||||
| # open two files & spray the diff to stdout; return false iff a problem | # open two files & spray the diff to stdout; return false iff a problem | ||||||
|  | @ -649,24 +562,39 @@ def fcompare(f1name, f2name): | ||||||
| 
 | 
 | ||||||
|     return 1 |     return 1 | ||||||
| 
 | 
 | ||||||
| # get file names from argv & compare; return false iff a problem | # crack args (sys.argv[1:] is normal) & compare; | ||||||
| def main(): | # return false iff a problem | ||||||
|     from sys import argv | 
 | ||||||
|     if len(argv) != 3: | def main(args): | ||||||
|         print 'need 2 args' |     import getopt | ||||||
|  |     try: | ||||||
|  |         opts, args = getopt.getopt(args, "q") | ||||||
|  |     except getopt.error, detail: | ||||||
|  |         print str(detail) | ||||||
|  |         print __doc__ | ||||||
|         return 0 |         return 0 | ||||||
|     [f1name, f2name] = argv[1:3] |     noisy = 1 | ||||||
|     print '-:', f1name |     for opt, val in opts: | ||||||
|     print '+:', f2name |         if opt == "-q": | ||||||
|  |             noisy = 0 | ||||||
|  |     if len(args) != 2: | ||||||
|  |         print 'need 2 args' | ||||||
|  |         print __doc__ | ||||||
|  |         return 0 | ||||||
|  |     f1name, f2name = args | ||||||
|  |     if noisy: | ||||||
|  |         print '-:', f1name | ||||||
|  |         print '+:', f2name | ||||||
|     return fcompare(f1name, f2name) |     return fcompare(f1name, f2name) | ||||||
| 
 | 
 | ||||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||||
|  |     import sys | ||||||
|  |     args = sys.argv[1:] | ||||||
|     if 1: |     if 1: | ||||||
|         main() |         main(args) | ||||||
|     else: |     else: | ||||||
|         import profile, pstats |         import profile, pstats | ||||||
|         statf = "ndiff.pro" |         statf = "ndiff.pro" | ||||||
|         profile.run("main()", statf) |         profile.run("main(args)", statf) | ||||||
|         stats = pstats.Stats(statf) |         stats = pstats.Stats(statf) | ||||||
|         stats.strip_dirs().sort_stats('time').print_stats() |         stats.strip_dirs().sort_stats('time').print_stats() | ||||||
| 
 |  | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Guido van Rossum
						Guido van Rossum