1997-10-06 18:54:25 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								#! /usr/bin/env python
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								"""A variant on webchecker that creates a mirror copy of a remote site."""
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											1998-02-21 20:08:39 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								__version__ = "$Revision$"
							 | 
						
					
						
							
								
									
										
										
										
											1997-10-06 18:54:25 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import os
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import sys
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import urllib
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import getopt
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											1999-11-17 15:40:48 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								import webchecker
							 | 
						
					
						
							
								
									
										
										
										
											1998-02-21 20:08:39 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								# Extract real version number if necessary
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								if __version__[0] == '$':
							 | 
						
					
						
							
								
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    _v = __version__.split()
							 | 
						
					
						
							
								
									
										
										
										
											1998-02-21 20:08:39 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if len(_v) == 3:
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        __version__ = _v[1]
							 | 
						
					
						
							
								
									
										
										
										
											1997-10-06 18:54:25 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def main():
							 | 
						
					
						
							
								
									
										
										
										
											1998-02-21 20:08:39 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    verbose = webchecker.VERBOSE
							 | 
						
					
						
							
								
									
										
										
										
											1997-10-06 18:54:25 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    try:
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        opts, args = getopt.getopt(sys.argv[1:], "qv")
							 | 
						
					
						
							
								
									
										
										
										
											1997-10-06 18:54:25 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    except getopt.error, msg:
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print msg
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return 2
							 | 
						
					
						
							
								
									
										
										
										
											1997-10-06 18:54:25 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for o, a in opts:
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if o == "-q":
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            verbose = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if o == "-v":
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            verbose = verbose + 1
							 | 
						
					
						
							
								
									
										
										
										
											1998-02-21 20:08:39 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    c = Sucker()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    c.setflags(verbose=verbose)
							 | 
						
					
						
							
								
									
										
										
										
											1997-10-06 18:54:25 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    c.urlopener.addheaders = [
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            ('User-agent', 'websucker/%s' % __version__),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        ]
							 | 
						
					
						
							
								
									
										
										
										
											1997-10-06 18:54:25 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for arg in args:
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print "Adding root", arg
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        c.addroot(arg)
							 | 
						
					
						
							
								
									
										
										
										
											1997-10-06 18:54:25 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    print "Run..."
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    c.run()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								class Sucker(webchecker.Checker):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											1998-02-21 20:08:39 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    checkext = 0
							 | 
						
					
						
							
								
									
										
										
										
											1999-11-17 15:04:26 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    nonames = 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # SAM 11/13/99: in general, URLs are now URL pairs.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # Since we've suppressed name anchor checking,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # we can ignore the second dimension.
							 | 
						
					
						
							
								
									
										
										
										
											1998-02-21 20:08:39 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											1999-11-17 15:04:26 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    def readhtml(self, url_pair):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        url = url_pair[0]
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        text = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        path = self.savefilename(url)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        try:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            f = open(path, "rb")
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        except IOError:
							 | 
						
					
						
							
								
									
										
										
										
											1999-11-17 15:04:26 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            f = self.openpage(url_pair)
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if f:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                info = f.info()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                nurl = f.geturl()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if nurl != url:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    url = nurl
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    path = self.savefilename(url)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                text = f.read()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                f.close()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                self.savefile(text, path)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if not self.checkforhtml(info, url):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    text = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if self.checkforhtml({}, url):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                text = f.read()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            f.close()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return text, url
							 | 
						
					
						
							
								
									
										
										
										
											1997-10-06 18:54:25 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    def savefile(self, text, path):
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        dir, base = os.path.split(path)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        makedirs(dir)
							 | 
						
					
						
							
								
									
										
										
										
											1999-01-03 13:06:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        try:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            f = open(path, "wb")
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            f.write(text)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            f.close()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            self.message("saved %s", path)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        except IOError, msg:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            self.message("didn't save %s: %s", path, str(msg))
							 | 
						
					
						
							
								
									
										
										
										
											1997-10-06 18:54:25 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    def savefilename(self, url):
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        type, rest = urllib.splittype(url)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        host, path = urllib.splithost(rest)
							 | 
						
					
						
							
								
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        path = path.lstrip("/")
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        user, host = urllib.splituser(host)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        host, port = urllib.splitnport(host)
							 | 
						
					
						
							
								
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        host = host.lower()
							 | 
						
					
						
							
								
									
										
										
										
											1998-06-15 12:34:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not path or path[-1] == "/":
							 | 
						
					
						
							
								
									
										
										
										
											2000-04-25 21:13:24 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            path = path + "index.html"
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if os.sep != "/":
							 | 
						
					
						
							
								
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            path = os.sep.join(path.split("/"))
							 | 
						
					
						
							
								
									
										
										
										
											2000-04-25 21:13:24 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if os.name == "mac":
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                path = os.sep + path
							 | 
						
					
						
							
								
									
										
										
										
											1998-06-15 12:34:41 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        path = os.path.join(host, path)
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return path
							 | 
						
					
						
							
								
									
										
										
										
											1997-10-06 18:54:25 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def makedirs(dir):
							 | 
						
					
						
							
								
									
										
										
										
											1999-01-03 13:06:00 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not dir:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if os.path.exists(dir):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not os.path.isdir(dir):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            try:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                os.rename(dir, dir + ".bak")
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                os.mkdir(dir)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                os.rename(dir + ".bak", os.path.join(dir, "index.html"))
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            except os.error:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                pass
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return
							 | 
						
					
						
							
								
									
										
										
										
											1997-10-06 18:54:25 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    head, tail = os.path.split(dir)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if not tail:
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        print "Huh?  Don't know how to make dir", dir
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return
							 | 
						
					
						
							
								
									
										
										
										
											1997-10-06 18:54:25 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    makedirs(head)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    os.mkdir(dir, 0777)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								if __name__ == '__main__':
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    sys.exit(main() or 0)
							 |