1997-01-30 03:18:23 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								"""
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Robots.txt file parser class.  Accepts a list of lines or robots.txt URL as
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								input, builds a set of rules from that list, then answers questions about
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								fetchability of other URLs.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								"""
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								class RobotFileParser:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    def __init__(self):
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        self.rules = {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        self.debug = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        self.url = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        self.last_checked = 0
							 | 
						
					
						
							
								
									
										
										
										
											1997-01-30 03:18:23 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    def mtime(self):
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return self.last_checked
							 | 
						
					
						
							
								
									
										
										
										
											1997-01-30 03:18:23 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    def modified(self):
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        import time
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        self.last_checked = time.time()
							 | 
						
					
						
							
								
									
										
										
										
											1997-01-30 03:18:23 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    def set_url(self, url):
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        self.url = url
							 | 
						
					
						
							
								
									
										
										
										
											1997-01-30 03:18:23 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    def read(self):
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        import urllib
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        self.parse(urllib.urlopen(self.url).readlines())
							 | 
						
					
						
							
								
									
										
										
										
											1997-01-30 03:18:23 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    def parse(self, lines):
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-27 19:29:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        """parse the input lines from a robot.txt file"""
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        import string, re
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        active = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for line in lines:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if self.debug: print '>', line,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            # blank line terminates current record
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not line[:-1]:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                active = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            # remove optional comment and strip line
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            line = string.strip(line[:string.find(line, '#')])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if not line:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                continue
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-27 19:29:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            line = re.split(' *: *', line)
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if len(line) == 2:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                line[0] = string.lower(line[0])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if line[0] == 'user-agent':
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    # this record applies to this user agent
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if self.debug: print '>> user-agent:', line[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    active.append(line[1])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if not self.rules.has_key(line[1]):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        self.rules[line[1]] = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                elif line[0] == 'disallow':
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if line[1]:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        if self.debug: print '>> disallow:', line[1]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        for agent in active:
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-27 19:29:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                            self.rules[agent].append(re.compile(line[1]))
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    else:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        pass
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        for agent in active:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            if self.debug: print '>> allow', agent
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            self.rules[agent] = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                else:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    if self.debug: print '>> unknown:', line
							 | 
						
					
						
							
								
									
										
										
										
											1997-01-30 03:18:23 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        self.modified()
							 | 
						
					
						
							
								
									
										
										
										
											1997-01-30 03:18:23 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # returns true if agent is allowed to fetch url
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-27 19:29:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    def can_fetch(self, useragent, url):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        """using the parsed robots.txt decide if useragent can fetch url"""
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        import urlparse
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-27 19:29:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        ag = useragent
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if not self.rules.has_key(ag): ag = '*'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not self.rules.has_key(ag):
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-27 19:29:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if self.debug: print '>> allowing', url, 'fetch by', useragent
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            return 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        path = urlparse.urlparse(url)[2]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for rule in self.rules[ag]:
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-27 19:29:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            if rule.match(path) is not None:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                if self.debug: print '>> disallowing', url, 'fetch by', useragent
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                return 0
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-27 19:29:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if self.debug: print '>> allowing', url, 'fetch by', useragent
							 | 
						
					
						
							
								
									
										
										
										
											1998-04-06 14:29:28 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return 1
							 | 
						
					
						
							
								
									
										
										
										
											1997-01-30 03:18:23 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-27 19:29:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def _test():
							 | 
						
					
						
							
								
									
										
										
										
											1997-01-30 03:18:23 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    rp = RobotFileParser()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    rp.debug = 1
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-27 19:29:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    rp.set_url('http://www.musi-cal.com/robots.txt')
							 | 
						
					
						
							
								
									
										
										
										
											1997-01-30 03:18:23 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    rp.read()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    print rp.rules
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-27 19:29:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
							 | 
						
					
						
							
								
									
										
										
										
											1997-01-30 03:18:23 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    print rp.can_fetch('Musi-Cal-Robot',
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-27 19:29:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                       'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')
							 | 
						
					
						
							
								
									
										
										
										
											1997-01-30 03:18:23 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-27 19:29:31 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								if __name__ == "__main__":
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    _test()
							 |