	Issue 21469: Mitigate risk of false positives with robotparser.
* Repair the broken link to norobots-rfc.txt.
* Treat HTTP response codes >= 500 as a failed read rather than as "not found". "Not found" means we can assume the entire site is allowed; a 5xx server error tells us nothing.
* A successful read() or parse() updates the mtime (defined to be "the time the robots.txt file was last fetched").
* The can_fetch() method returns False unless we've had a read() with a 2xx or 4xx response. This avoids false positives when a user calls can_fetch() before calling read().
* There is no easy way to test this patch without hitting internet resources that might change, or without mock objects that wouldn't provide much reassurance.
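The status-code policy described above can be summarized in a short standalone sketch. This is an illustration only, not the module's actual read() method; the helper name fetch_robots_flags and its return shape are invented for this example:

import urllib.error
import urllib.request

def fetch_robots_flags(robots_url):
    """Illustrative sketch: map the HTTP outcome of fetching robots.txt
    to the (disallow_all, allow_all) flags the patch describes."""
    disallow_all = allow_all = False
    raw = None
    try:
        f = urllib.request.urlopen(robots_url)
    except urllib.error.HTTPError as err:
        if err.code in (401, 403):
            # robots.txt itself is restricted: assume the whole site is off-limits.
            disallow_all = True
        elif 400 <= err.code < 500:
            # "Not found" and other 4xx: no robots.txt, so the site is allowed.
            allow_all = True
        # 5xx is deliberately ignored: a server error tells us nothing, so
        # neither flag is set and can_fetch() stays conservative.
    else:
        raw = f.read()
    return raw, disallow_all, allow_all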
commit 122541bece
parent 73308d6869
Author: Raymond Hettinger
1 changed file with 9 additions and 2 deletions
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -7,7 +7,7 @@
     2) PSF license for Python 2.2
 
     The robots.txt Exclusion Protocol is implemented as specified in
-    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
+    http://www.robotstxt.org/norobots-rfc.txt
 """
 
 import urllib.parse, urllib.request
@@ -57,7 +57,7 @@ def read(self):
         except urllib.error.HTTPError as err:
             if err.code in (401, 403):
                 self.disallow_all = True
-            elif err.code >= 400:
+            elif err.code >= 400 and err.code < 500:
                 self.allow_all = True
         else:
             raw = f.read()
@@ -85,6 +85,7 @@ def parse(self, lines):
         state = 0
         entry = Entry()
 
+        self.modified()
         for line in lines:
             if not line:
                 if state == 1:
@@ -129,6 +130,12 @@ def can_fetch(self, useragent, url):
             return False
         if self.allow_all:
             return True
+        # Until the robots.txt file has been read or found not
+        # to exist, we must assume that no url is allowable.
+        # This prevents false positives when a user erroneously
+        # calls can_fetch() before calling read().
+        if not self.last_checked:
+            return False
         # search for given user agent matches
         # the first match counts
         parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
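A quick way to see the combined effect of the parse()/modified() change and the new last_checked guard, without touching the network. This is a usage sketch; the agent name and robots.txt rules are invented for illustration:

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()

# Nothing has been fetched or parsed yet, so mtime() (last_checked) is 0
# and, with this patch, can_fetch() refuses everything instead of
# optimistically returning True.
assert rp.mtime() == 0
assert not rp.can_fetch("ExampleBot", "http://example.com/private/page.html")

# parse() now calls modified(), so mtime() records when the rules were read.
rp.parse([
    "User-agent: *",
    "Disallow: /private/",
])
assert rp.mtime() > 0
assert rp.can_fetch("ExampleBot", "http://example.com/index.html")
assert not rp.can_fetch("ExampleBot", "http://example.com/private/page.html")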