""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
import urlparse
import urllib

__all__ = ["RobotFileParser"]
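
# A minimal usage sketch (the host and agent names below are placeholders,
# not part of the module):
#
#     rp = RobotFileParser()
#     rp.set_url("http://www.example.com/robots.txt")
#     rp.read()
#     if rp.can_fetch("ExampleBot", "http://www.example.com/some/page.html"):
#         pass  # fetching this page is permitted for ExampleBot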


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = []
        line = f.readline()
        while line:
            lines.append(line.strip())
            line = f.readline()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif self.errcode >= 400:
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)
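
    # For orientation, parse() consumes the already-stripped lines of a
    # robots.txt file, e.g. (hypothetical sample):
    #
    #     User-agent: *
    #     Disallow: /cgi-bin/
    #     Allow: /cgi-bin/public/
    #
    # Consecutive User-agent lines accumulate into one entry; an entry for
    # "*" acts as the default, consulted only when no specific agent matches.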

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

           A user-agent: line need not be preceded by one or more
           blank lines."""
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber = linenumber + 1
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
        if state == 2:
            self.entries.append(entry)

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
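        # normalize the requested URL: unquote it, keep only the path
        # component, then re-quote; an empty path is treated as "/"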
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance
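
    # Note: matching is a plain prefix test on the quoted path, so
    # RuleLine("/tmp", False) applies to "/tmp", "/tmp/" and "/tmpfiles"
    # alike.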

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)
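
    # Note: the comparison below is a lowercase substring test on the name
    # token, so an entry listing "webcrawler" applies to a client that
    # identifies itself as "WebCrawler/3.0".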

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
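

# URLopener records the HTTP status code so that read() can distinguish
# 401/403 (disallow all), other codes >= 400 (allow all), and 200 (parse).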
class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
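

# The following is an illustrative self-check rather than part of the
# original module: it exercises parse() and can_fetch() on canned input,
# so no network access is involved.  The agent name "SomeBot" and the
# sample rules are placeholders.
def _example():
    lines = [
        "User-agent: *",
        "Allow: /cgi-bin/public/",
        "Disallow: /cgi-bin/",
    ]
    rp = RobotFileParser()
    rp.parse(lines)
    print rp.can_fetch("SomeBot", "/cgi-bin/public/index.html")  # expected: True
    print rp.can_fetch("SomeBot", "/cgi-bin/private/data")       # expected: False
    print rp.can_fetch("SomeBot", "/index.html")                 # expected: True

if __name__ == '__main__':
    _example()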