mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 05:31:20 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			131 lines
		
	
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
			
		
		
	
	
			131 lines
		
	
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
| #! /usr/bin/env python
 | |
| 
 | |
| """A variant on webchecker that creates a mirror copy of a remote site."""
 | |
| 
 | |
| __version__ = "0.1"
 | |
| 
 | |
| import os
 | |
| import sys
 | |
| import string
 | |
| import urllib
 | |
| import getopt
 | |
| 
 | |
| import webchecker
 | |
| verbose = webchecker.verbose
 | |
| 
 | |
def main():
    """Parse the command line and mirror every root URL given on it.

    Options (parsed with getopt, option string "qv"):
      -q  set the verbosity of this module and of webchecker to 0
      -v  raise that verbosity by one (may be repeated)

    Every remaining argument is added as a root URL to a Sucker
    instance, which is then run.

    Returns 2 when the options cannot be parsed; otherwise falls off the
    end (returns None) after the run completes.
    """
    global verbose
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        print(msg)
        # The usage line now lists -q as well; getopt accepts it above.
        print("usage: %s [-q] [-v] ... [rooturl] ..." % (sys.argv[0],))
        return 2
    for o, a in opts:
        # Keep webchecker's verbosity in step with ours, since the
        # inherited Checker code reads webchecker.verbose.
        if o == "-q":
            webchecker.verbose = verbose = 0
        if o == "-v":
            webchecker.verbose = verbose = verbose + 1
    c = Sucker(0)
    # Identify ourselves to servers with our own name and version.
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print("Adding root %s" % (arg,))
        c.addroot(arg)
    print("Run...")
    c.run()
 | |
| 
 | |
class Sucker(webchecker.Checker):
    """A webchecker.Checker that keeps a local copy of every page it reads.

    Pages are stored under the relative path computed by savefilename()
    (``<host>/<path>``).  A file that already exists at that path is
    read instead of fetching the URL again.
    """

    # Alas, had to copy this to make one change...
    def getpage(self, url):
        """Fetch (or load from disk) url and return a Page to scan for links.

        Returns a webchecker.Page built from the text and the final URL
        when the content type is 'text/html'.  Returns None when the URL
        is a mailto:/news: link, is external and external checking is
        off, cannot be opened (it is then recorded via setbad()), is
        external (opened but not scanned), or is not HTML.
        """
        if url[:7] == 'mailto:' or url[:5] == 'news:':
            if verbose > 1:
                print(" Not checking mailto/news URL")
            return None
        isint = self.inroots(url)
        if not isint and not self.checkext:
            if verbose > 1:
                print(" Not checking ext link")
            return None
        path = self.savefilename(url)
        saved = 0
        try:
            # A readable local file means this URL was mirrored before.
            f = open(path, "rb")
        except IOError:
            try:
                f = self.urlopener.open(url)
            except IOError as msg:
                msg = webchecker.sanitize(msg)
                if verbose > 0:
                    print("Error  %s" % (msg,))
                    webchecker.show(" HREF ", url, "  from", self.todo[url])
                self.setbad(url, msg)
                return None
            if not isint:
                if verbose > 1:
                    print(" Not gathering links from ext URL")
                # Was a bare safeclose(f), a name this module never
                # defines; the helper lives in webchecker.
                webchecker.safeclose(f)
                return None
            nurl = f.geturl()
            if nurl != url:
                # Redirected: store the page under its final URL.
                path = self.savefilename(nurl)
            info = f.info()
        else:
            if verbose:
                print("Loading cached URL %s" % (url,))
            saved = 1
            nurl = url
            info = {}
            # No headers for a local file; a directory-style URL was
            # stored as index.html, so treat it as HTML.
            if url[-1:] == "/":
                info["content-type"] = "text/html"
        text = f.read()
        if not saved:
            self.savefile(text, path)
        if 'content-type' in info:
            ctype = info['content-type'].lower()
            # Drop parameters such as "; charset=iso-8859-1" so that the
            # comparison with 'text/html' below still matches.
            if ';' in ctype:
                ctype = ctype.split(';', 1)[0].strip()
        else:
            ctype = None
        if nurl != url:
            if verbose > 1:
                print(" Redirected to %s" % (nurl,))
        if not ctype:
            ctype, encoding = webchecker.mimetypes.guess_type(nurl)
        if ctype != 'text/html':
            webchecker.safeclose(f)
            if verbose > 1:
                print(" Not HTML, mime type %s" % (ctype,))
            return None
        f.close()
        return webchecker.Page(text, nurl)

    def savefile(self, text, path):
        """Write text to path in binary mode, creating parent directories."""
        makedirs(os.path.dirname(path))
        # The with-block closes the file even if write() raises.
        with open(path, "wb") as f:
            f.write(text)
        print("saved %s" % (path,))

    def savefilename(self, url):
        """Return the relative local path under which url is stored.

        The result is ``<lowercased host>/<url path>`` with the scheme,
        user info and port removed; a path ending in "/" gets
        "index.html" appended.  "/" is converted to os.sep on platforms
        where they differ.
        """
        import posixpath

        scheme, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        path = path.lstrip("/")
        user, host = urllib.splituser(host)
        host, port = urllib.splitnport(host)
        host = host.lower()
        # Join with "/" on every platform so the trailing-slash test
        # below also works where os.sep is not "/".
        path = posixpath.join(host, path)
        # NOTE(review): the URL path is used verbatim; ".." segments are
        # not filtered here -- confirm callers normalize URLs first.
        if path[-1:] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = path.replace("/", os.sep)
        return path
 | |
| 
 | |
def makedirs(dir):
    """Create directory dir, and any missing parents, with mode 0777.

    Does nothing when dir is empty or already exists.  A path written
    with a trailing separator ("a/b/") is created as "a/b".  A
    directory that appears between the existence check and the mkdir
    call is tolerated; any other OSError propagates.
    """
    if not dir or os.path.exists(dir):
        return
    head, tail = os.path.split(dir)
    if not tail:
        if head == dir:
            # Nothing left to strip (e.g. a bare root that does not exist).
            print("Huh?  Don't know how to make dir %s" % (dir,))
            return
        # split() only removed trailing separators; create the real name.
        makedirs(head)
        return
    makedirs(head)
    try:
        os.mkdir(dir, 0o777)
    except OSError:
        # Something else may have created it since the exists() check.
        if not os.path.isdir(dir):
            raise
 | |
| 
 | |
# Script entry point: main() returns 2 on a usage error and None
# otherwise, so map a missing status to exit code 0.
if __name__ == "__main__":
    status = main()
    sys.exit(status or 0)
 | 
