""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
import urlparse, urllib

__all__ = ["RobotFileParser"]

debug = 0

def _debug(msg):
    if debug: print msg

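
# A minimal usage sketch (the URL and agent name below are hypothetical
# placeholders, shown only to illustrate the intended call sequence):
#
#     rp = RobotFileParser()
#     rp.set_url('http://www.example.com/robots.txt')
#     rp.read()
#     rp.can_fetch('ExampleBot', 'http://www.example.com/some/page.html')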
					
						
class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = 0
        self.allow_all = 0
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = []
        line = f.readline()
        while line:
            lines.append(line.strip())
            line = f.readline()
        self.errcode = opener.errcode
        if self.errcode == 401 or self.errcode == 403:
            self.disallow_all = 1
            _debug("disallow all")
        elif self.errcode >= 400:
            self.allow_all = 1
            _debug("allow all")
        elif self.errcode == 200 and lines:
            _debug("parse lines")
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

           A user-agent: line need not be preceded by
           one or more blank lines."""
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber = linenumber + 1
            if not line:
                if state==1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state==2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i>=0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state==2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 0))
                        state = 2
                elif line[0] == "allow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 1))
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                               line[0]))
            else:
                _debug("line %d: error: malformed line %s" % (linenumber, line))
        if state==2:
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))
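
    # For reference, a minimal robots.txt of the kind parse() consumes
    # (an illustrative example, not taken from any real site):
    #
    #     User-agent: CherryPickerSE
    #     Disallow: /cgi-bin/event-search
    #
    #     User-agent: *
    #     Disallow: /search
    #
    # A blank line ends an entry, and "*" names the default entry that
    # _add_entry() stores in self.default_entry.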

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        _debug("Checking robots.txt allowance for:\n  user agent: %s\n  url: %s" %
               (useragent, url))
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret


class RuleLine:
    """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
       (allowance==0) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = 1
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path=="*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow")+": "+self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = ""
        for agent in self.useragents:
            ret = ret + "User-agent: "+agent+"\n"
        for line in self.rulelines:
            ret = ret + str(line) + "\n"
        return ret

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent=='*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if useragent.find(agent) != -1:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            _debug((filename, str(line), line.allowance))
            if line.applies_to(filename):
                return line.allowance
        return 1
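
# Illustrative behaviour of Entry.applies_to() (hypothetical values,
# assuming an entry whose useragents list contains 'CherryPickerSE'):
#
#     entry.applies_to('CherryPickerSE/1.0')   # True: version token stripped
#     entry.applies_to('cherrypickerse')       # True: comparison is lowercased
#     entry.applies_to('Mozilla')              # False: no substring match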

class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        apply(urllib.FancyURLopener.__init__, (self,) + args)
        self.errcode = 200

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)


def _check(a, b):
    if not b:
        ac = "access denied"
    else:
        ac = "access allowed"
    if a != b:
        print "failed"
    else:
        print "ok (%s)" % ac
    print


def _test():
    global debug
    rp = RobotFileParser()
    debug = 1

    # robots.txt that exists, gotten to by redirection
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()

    # test for re.escape
    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers
    _check(rp.can_fetch('CherryPickerSE',
                       'http://www.musi-cal.com/cgi-bin/event-search'
                       '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.0',
                       'http://www.musi-cal.com/cgi-bin/event-search'
                       '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.5',
                       'http://www.musi-cal.com/cgi-bin/event-search'
                       '?city=San+Francisco'), 0)
    # case sensitivity
    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
    # substring test
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)

    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)


if __name__ == '__main__':
    _test()