Mirror of https://github.com/python/cpython.git, synced 2025-10-30 21:21:22 +00:00
	Get rid of _test(), _main(), _debug() and _check(). Tests are no longer
needed (better set available in Lib/test/test_robotparser.py). Clean up a few PEP 8 nits (compound statements on a single line, whitespace around operators).
parent dfd982715b
commit b8bdbc04e7
1 changed file with 12 additions and 93 deletions
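The ad-hoc driver removed below is superseded by the suite in Lib/test/test_robotparser.py. As a rough sketch of that style only (the robots.txt body, test names, and URLs here are invented for illustration, not taken from the real test file), an equivalent unittest-based check might look like this; parsing a literal robots.txt body avoids the live network fetches the old _test() depended on:

    # Illustrative sketch only; the real tests live in Lib/test/test_robotparser.py.
    import unittest
    import robotparser

    ROBOTS_TXT = """\
    User-agent: CherryPickerSE
    Disallow: /cgi-bin/event-search

    User-agent: *
    Disallow: /search
    """

    class RobotFileParserTest(unittest.TestCase):
        def setUp(self):
            self.parser = robotparser.RobotFileParser()
            self.parser.modified()      # mark as freshly fetched
            # parse() accepts the robots.txt body as a list of lines,
            # so no network access is needed (the old _test() hit live URLs)
            self.parser.parse(ROBOTS_TXT.splitlines())

        def test_named_agent_is_blocked(self):
            # agent matching is on the name token, case-insensitively
            self.assertFalse(self.parser.can_fetch(
                'CherryPickerSE/1.0', 'http://example.com/cgi-bin/event-search'))

        def test_catch_all_agent(self):
            self.assertFalse(self.parser.can_fetch('spam', 'http://example.com/search'))
            self.assertTrue(self.parser.can_fetch('spam', 'http://example.com/'))

    if __name__ == '__main__':
        unittest.main()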
				
			
Lib/robotparser.py
@@ -9,15 +9,11 @@
     The robots.txt Exclusion Protocol is implemented as specified in
     http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
 """
-import urlparse,urllib
+import urlparse
+import urllib
 
 __all__ = ["RobotFileParser"]
 
-debug = 0
-
-def _debug(msg):
-    if debug: print msg
-
 
 class RobotFileParser:
     """ This class provides a set of methods to read, parse and answer
@@ -67,12 +63,9 @@ def read(self):
         self.errcode = opener.errcode
         if self.errcode in (401, 403):
             self.disallow_all = True
-            _debug("disallow all")
         elif self.errcode >= 400:
             self.allow_all = True
-            _debug("allow all")
         elif self.errcode == 200 and lines:
-            _debug("parse lines")
             self.parse(lines)
 
     def _add_entry(self, entry):
@@ -93,19 +86,16 @@ def parse(self, lines):
         for line in lines:
             linenumber = linenumber + 1
             if not line:
-                if state==1:
-                    _debug("line %d: warning: you should insert"
-                           " allow: or disallow: directives below any"
-                           " user-agent: line" % linenumber)
+                if state == 1:
                     entry = Entry()
                     state = 0
-                elif state==2:
+                elif state == 2:
                     self._add_entry(entry)
                     entry = Entry()
                     state = 0
             # remove optional comment and strip line
             i = line.find('#')
-            if i>=0:
+            if i >= 0:
                 line = line[:i]
             line = line.strip()
             if not line:
@@ -115,41 +105,24 @@ def parse(self, lines):
                 line[0] = line[0].strip().lower()
                 line[1] = urllib.unquote(line[1].strip())
                 if line[0] == "user-agent":
-                    if state==2:
-                        _debug("line %d: warning: you should insert a blank"
-                               " line before any user-agent"
-                               " directive" % linenumber)
+                    if state == 2:
                         self._add_entry(entry)
                         entry = Entry()
                     entry.useragents.append(line[1])
                     state = 1
                 elif line[0] == "disallow":
-                    if state==0:
-                        _debug("line %d: error: you must insert a user-agent:"
-                               " directive before this line" % linenumber)
-                    else:
+                    if state != 0:
                         entry.rulelines.append(RuleLine(line[1], False))
                         state = 2
                 elif line[0] == "allow":
-                    if state==0:
-                        _debug("line %d: error: you must insert a user-agent:"
-                               " directive before this line" % linenumber)
-                    else:
+                    if state != 0:
                         entry.rulelines.append(RuleLine(line[1], True))
-                else:
-                    _debug("line %d: warning: unknown key %s" % (linenumber,
-                               line[0]))
-            else:
-                _debug("line %d: error: malformed line %s"%(linenumber, line))
-        if state==2:
+        if state == 2:
             self.entries.append(entry)
-        _debug("Parsed rules:\n%s" % str(self))
 
 
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
-        _debug("Checking robots.txt allowance for:\n  user agent: %s\n  url: %s" %
-               (useragent, url))
         if self.disallow_all:
             return False
         if self.allow_all:
@@ -182,10 +155,10 @@ def __init__(self, path, allowance):
         self.allowance = allowance
 
     def applies_to(self, filename):
-        return self.path=="*" or filename.startswith(self.path)
+        return self.path == "*" or filename.startswith(self.path)
 
     def __str__(self):
-        return (self.allowance and "Allow" or "Disallow")+": "+self.path
+        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
 
 
 class Entry:
@@ -207,7 +180,7 @@ def applies_to(self, useragent):
         # split the name token and make it lower case
         useragent = useragent.split("/")[0].lower()
         for agent in self.useragents:
-            if agent=='*':
+            if agent == '*':
                 # we have the catch-all agent
                 return True
             agent = agent.lower()
@@ -220,7 +193,6 @@ def allowance(self, filename):
         - our agent applies to this entry
         - filename is URL decoded"""
         for line in self.rulelines:
-            _debug((filename, str(line), line.allowance))
             if line.applies_to(filename):
                 return line.allowance
         return True
@@ -239,56 +211,3 @@ def http_error_default(self, url, fp, errcode, errmsg, headers):
         self.errcode = errcode
         return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                         errmsg, headers)
-
-def _check(a,b):
-    if not b:
-        ac = "access denied"
-    else:
-        ac = "access allowed"
-    if a!=b:
-        print "failed"
-    else:
-        print "ok (%s)" % ac
-    print
-
-def _test():
-    global debug
-    rp = RobotFileParser()
-    debug = 1
-
-    # robots.txt that exists, gotten to by redirection
-    rp.set_url('http://www.musi-cal.com/robots.txt')
-    rp.read()
-
-    # test for re.escape
-    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
-    # this should match the first rule, which is a disallow
-    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
-    # various cherry pickers
-    _check(rp.can_fetch('CherryPickerSE',
-                       'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco'), 0)
-    _check(rp.can_fetch('CherryPickerSE/1.0',
-                       'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco'), 0)
-    _check(rp.can_fetch('CherryPickerSE/1.5',
-                       'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco'), 0)
-    # case sensitivity
-    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
-    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
-    # substring test
-    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
-    # tests for catch-all * agent
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
-
-    # robots.txt that does not exist
-    rp.set_url('http://www.lycos.com/robots.txt')
-    rp.read()
-    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
-
-if __name__ == '__main__':
-    _test()
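For reference, the public API is untouched by this cleanup. A minimal usage sketch of the usual set_url()/read()/can_fetch() flow (example.com and MyCrawler/1.0 are placeholders, and read() performs a live HTTP fetch of the robots.txt):

    # Minimal usage sketch of the unchanged public API.
    import robotparser

    rp = robotparser.RobotFileParser()
    rp.set_url('http://example.com/robots.txt')
    rp.read()   # 401/403 -> disallow_all; other >= 400 -> allow_all; 200 -> parse
    if rp.can_fetch('MyCrawler/1.0', 'http://example.com/some/page'):
        print "fetching is allowed"
    else:
        print "fetching is denied"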
Author: Skip Montanaro