mirror of
				https://github.com/python/cpython.git
				synced 2025-11-03 23:21:29 +00:00 
			
		
		
		
	
		
			
	
	
		
			132 lines
		
	
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			132 lines
		
	
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| 
								 | 
							
								#! /usr/bin/env python
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								"""A variant on webchecker that creates a mirror copy of a remote site."""
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								__version__ = "0.1"
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								import os
							 | 
						||
| 
								 | 
							
								import sys
							 | 
						||
| 
								 | 
							
								import string
							 | 
						||
| 
								 | 
							
								import urllib
							 | 
						||
| 
								 | 
							
								import getopt
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								import webchecker
							 | 
						||
| 
								 | 
							
								# Module-level verbosity, seeded from webchecker's current setting.
								# main() rebinds this name and webchecker.verbose together when it
								# processes -q / -v, and Sucker.getpage() reads it to decide what to print.
								verbose = webchecker.verbose
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								def main():
								    """Parse the command line, then mirror every root URL given.

								    Options: ``-q`` sets the verbosity to 0 and each ``-v`` raises it by
								    one.  The new value is stored both in this module's ``verbose`` and
								    in ``webchecker.verbose``.  Every remaining argument is added as a
								    root of a single Sucker instance, which is then run.

								    Returns 2 after printing the getopt error and a usage line when the
								    options cannot be parsed; otherwise falls off the end (None).
								    """
								    global verbose
								    try:
								        opts, args = getopt.getopt(sys.argv[1:], "qv")
								    except getopt.error as msg:
								        print(msg)
								        # The usage line lists -q as well, since getopt accepts "qv".
								        print("usage: %s [-q] [-v] ... [rooturl] ..." % sys.argv[0])
								        return 2
								    for o, a in opts:
								        # The two options are mutually exclusive per item, hence elif.
								        if o == "-q":
								            webchecker.verbose = verbose = 0
								        elif o == "-v":
								            webchecker.verbose = verbose = verbose + 1
								    # NOTE(review): the meaning of the 0 argument is defined by
								    # webchecker.Checker's constructor, which is not visible here.
								    c = Sucker(0)
								    # Identify this tool (and its version) to the servers it contacts.
								    c.urlopener.addheaders = [
								        ('User-agent', 'websucker/%s' % __version__),
								    ]
								    for arg in args:
								        print("Adding root %s" % arg)
								        c.addroot(arg)
								    print("Run...")
								    c.run()
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								class Sucker(webchecker.Checker):
								    """A webchecker.Checker that keeps a local copy of what it fetches.

								    getpage() first looks for a previously saved file for the URL and
								    only goes to the network when that file cannot be opened; freshly
								    fetched data is written to disk with savefile() under the name
								    computed by savefilename().
								    """

								    # Alas, had to copy this to make one change...
								    def getpage(self, url):
								        """Return a webchecker.Page for *url*, or None if it is skipped.

								        None is returned when:
								          * the URL is a mailto: or news: URL;
								          * the URL is not inside the roots and self.checkext is false;
								          * opening the URL raises IOError (the URL is recorded with
								            self.setbad());
								          * the URL is outside the roots (it is opened, then closed
								            without being read or saved);
								          * the content type is not text/html.

								        Data fetched from the network is saved before the content type
								        is examined, so non-HTML resources are saved too.
								        """
								        if url[:7] == 'mailto:' or url[:5] == 'news:':
								            if verbose > 1:
								                print(" Not checking mailto/news URL")
								            return None
								        isint = self.inroots(url)
								        if not isint and not self.checkext:
								            if verbose > 1:
								                print(" Not checking ext link")
								            return None
								        path = self.savefilename(url)
								        saved = 0
								        try:
								            # Prefer a copy saved by an earlier fetch.
								            f = open(path, "rb")
								        except IOError:
								            try:
								                f = self.urlopener.open(url)
								            except IOError as err:
								                msg = webchecker.sanitize(err)
								                if verbose > 0:
								                    # (msg,) keeps tuple-valued messages printable.
								                    print("Error  %s" % (msg,))
								                    webchecker.show(" HREF ", url, "  from",
								                                    self.todo[url])
								                self.setbad(url, msg)
								                return None
								            if not isint:
								                if verbose > 1:
								                    print(" Not gathering links from ext URL")
								                # safeclose is provided by webchecker; this module
								                # defines no such name, so the call must be qualified.
								                webchecker.safeclose(f)
								                return None
								            nurl = f.geturl()
								            if nurl != url:
								                # Redirected: save under the name of the final URL.
								                path = self.savefilename(nurl)
								            info = f.info()
								        else:
								            if verbose:
								                print("Loading cached URL %s" % (url,))
								            saved = 1
								            nurl = url
								            # A local file carries no headers.  Only a URL ending in
								            # "/" is assumed to be HTML; anything else is guessed from
								            # the URL further down.
								            info = {}
								            if url[-1:] == "/":
								                info["content-type"] = "text/html"
								        text = f.read()
								        if not saved:
								            self.savefile(text, path)
								        if 'content-type' in info:
								            ctype = info['content-type'].lower()
								        else:
								            ctype = None
								        if nurl != url:
								            if verbose > 1:
								                print(" Redirected to %s" % (nurl,))
								        if not ctype:
								            ctype, encoding = webchecker.mimetypes.guess_type(nurl)
								        if ctype != 'text/html':
								            webchecker.safeclose(f)
								            if verbose > 1:
								                print(" Not HTML, mime type %s" % (ctype,))
								            return None
								        f.close()
								        return webchecker.Page(text, nurl)

								    def savefile(self, text, path):
								        """Write *text* to *path* in binary mode, creating directories.

								        The directory part of *path* is created with makedirs() first.
								        Prints "saved <path>" once the data has been written.
								        """
								        makedirs(os.path.dirname(path))
								        # The with-block closes the file even if write() raises.
								        with open(path, "wb") as f:
								            f.write(text)
								        print("saved %s" % (path,))

								    def savefilename(self, url):
								        """Return the local file name under which *url* is mirrored.

								        The name is the lower-cased host (user info and port removed)
								        joined with the URL path stripped of leading slashes.  A name
								        that is empty or ends in a separator gets "index.html"
								        appended.  On platforms whose os.sep is not "/", every "/" is
								        replaced by os.sep.
								        """
								        rest = urllib.splittype(url)[1]
								        host, path = urllib.splithost(rest)
								        path = path.lstrip("/")
								        host = urllib.splituser(host)[1]
								        host = urllib.splitnport(host)[0]
								        host = host.lower()
								        path = os.path.join(host, path)
								        # os.path.join() may have ended the name with os.sep rather
								        # than "/", and the name can be empty when the URL has
								        # neither host nor path; both denote a directory.
								        if not path or path[-1:] in ("/", os.sep):
								            path = path + "index.html"
								        if os.sep != "/":
								            path = os.sep.join(path.split("/"))
								        return path
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								def makedirs(dir):
								    """Create directory *dir* together with any missing parents.

								    Does nothing when *dir* is empty or already exists.  A path that
								    ends in a separator is handled by creating the part in front of
								    the separator.  If no final component can be split off at all, a
								    diagnostic is printed and nothing is created.

								    Raises OSError if a directory cannot be created, except when the
								    directory turns out to exist after the failed attempt.
								    """
								    if not dir or os.path.exists(dir):
								        return
								    head, tail = os.path.split(dir)
								    if not tail:
								        if head and head != dir:
								            # Trailing separator ("a/b/"): os.path.split() gives an
								            # empty tail, so create the path in front of it instead.
								            makedirs(head)
								            return
								        print("Huh?  Don't know how to make dir %s" % (dir,))
								        return
								    makedirs(head)
								    try:
								        os.mkdir(dir, 0o777)
								    except OSError:
								        # Tolerate the directory having appeared since the exists()
								        # check above; re-raise every other failure.
								        if not os.path.isdir(dir):
								            raise
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								if __name__ == '__main__':
								    # Script entry point: exit with main()'s return value, mapping a
								    # false result (such as None) to exit status 0.
								    raise SystemExit(main() or 0)
							 |