Issue #25400: RobotFileParser now correctly returns default values for
crawl_delay and request_rate.

Initial patch by Peter Wirtz.
Author: Berker Peksag
Commit: 9a7bbb2e3f
Parent: 85c98bf968

3 changed files with 46 additions and 21 deletions
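In practical terms: with only a default ('User-agent: *') entry in
robots.txt, crawl_delay() and request_rate() previously returned None for
any agent that had no entry of its own; they now fall back to the default
entry. A minimal sketch of the fixed behavior (not part of the commit;
requires a Python with this change applied, 3.6+), mirroring the new
DefaultEntryTest below:

    from urllib.robotparser import RobotFileParser

    lines = [
        'User-agent: *',
        'Crawl-delay: 1',
        'Request-rate: 3/15',
        'Disallow: /cyberworld/map/',
    ]
    parser = RobotFileParser()
    parser.parse(lines)

    # 'FigTree Robot' has no entry of its own, so both lookups fall
    # through to the default ('*') entry; before this fix they hit the
    # bare "return None" at the end of each method.
    print(parser.crawl_delay('FigTree Robot'))   # 1
    rate = parser.request_rate('FigTree Robot')
    print(rate.requests, rate.seconds)           # 3 15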
				
			
Lib/test/test_robotparser.py

@@ -79,7 +79,28 @@ class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
     bad = ['/cyberworld/map/index.html', '/', '/tmp/']
 
 
-class CrawlDelayAndRequestRateTest(BaseRobotTest, unittest.TestCase):
+class BaseRequestRateTest(BaseRobotTest):
+
+    def test_request_rate(self):
+        for url in self.good + self.bad:
+            agent, url = self.get_agent_and_url(url)
+            with self.subTest(url=url, agent=agent):
+                if self.crawl_delay:
+                    self.assertEqual(
+                        self.parser.crawl_delay(agent), self.crawl_delay
+                    )
+                if self.request_rate:
+                    self.assertEqual(
+                        self.parser.request_rate(agent).requests,
+                        self.request_rate.requests
+                    )
+                    self.assertEqual(
+                        self.parser.request_rate(agent).seconds,
+                        self.request_rate.seconds
+                    )
+
+
+class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = """\
 User-agent: figtree
 Crawl-delay: 3
@@ -96,24 +117,6 @@ class CrawlDelayAndRequestRateTest(BaseRobotTest, unittest.TestCase):
     bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
            '/a%2fb.html', '/~joe/index.html']
 
-    def test_request_rate(self):
-        for url in self.good:
-            agent, url = self.get_agent_and_url(url)
-            with self.subTest(url=url, agent=agent):
-                if self.crawl_delay:
-                    self.assertEqual(
-                        self.parser.crawl_delay(agent), self.crawl_delay
-                    )
-                if self.request_rate and self.parser.request_rate(agent):
-                    self.assertEqual(
-                        self.parser.request_rate(agent).requests,
-                        self.request_rate.requests
-                    )
-                    self.assertEqual(
-                        self.parser.request_rate(agent).seconds,
-                        self.request_rate.seconds
-                    )
-
 
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
     agent = 'FigTree Robot libwww-perl/5.04'
@@ -230,6 +233,19 @@ class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
     bad = ['/another/path?']
 
 
+class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Crawl-delay: 1
+Request-rate: 3/15
+Disallow: /cyberworld/map/
+    """
+    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
+    crawl_delay = 1
+    good = ['/', '/test.html']
+    bad = ['/cyberworld/map/index.html']
+
+
 class RobotHandler(BaseHTTPRequestHandler):
 
     def do_GET(self):
@@ -309,6 +325,8 @@ def test_read_404(self):
         self.assertTrue(parser.allow_all)
         self.assertFalse(parser.disallow_all)
         self.assertEqual(parser.mtime(), 0)
+        self.assertIsNone(parser.crawl_delay('*'))
+        self.assertIsNone(parser.request_rate('*'))
 
 if __name__=='__main__':
     unittest.main()
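A note on the request_rate class attribute used by these tests: on the
library side, request_rate() returns a two-field named tuple, and the tests
build a structurally matching one locally so attribute access lines up. A
tiny illustration (not part of the diff):

    from collections import namedtuple

    # "Request-rate: 3/15" means three requests per fifteen seconds.
    req_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
    print(req_rate.requests)   # 3
    print(req_rate.seconds)    # 15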
Lib/urllib/robotparser.py

@@ -175,16 +175,20 @@ def can_fetch(self, useragent, url):
         return True
 
     def crawl_delay(self, useragent):
+        if not self.mtime():
+            return None
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.delay
-        return None
+        return self.default_entry.delay
 
     def request_rate(self, useragent):
+        if not self.mtime():
+            return None
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.req_rate
-        return None
+        return self.default_entry.req_rate
 
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
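The new "if not self.mtime(): return None" guard handles parsers that have
not successfully read a robots.txt yet: mtime() is still 0 and
default_entry may still be None, so falling through to
self.default_entry.delay would raise AttributeError. A quick sketch of the
guarded behavior (not part of the commit), matching the assertions added to
test_read_404 above:

    from urllib.robotparser import RobotFileParser

    parser = RobotFileParser('http://www.example.com/robots.txt')
    # Nothing has been read or parsed yet, so mtime() is 0 ...
    assert parser.mtime() == 0
    # ... and both accessors return None instead of raising.
    assert parser.crawl_delay('*') is None
    assert parser.request_rate('*') is None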
Misc/NEWS

@@ -29,6 +29,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #25400: RobotFileParser now correctly returns default values for
+  crawl_delay and request_rate.  Initial patch by Peter Wirtz.
+
 - Issue #27932: Prevent memory leak in win32_ver().
 
 - Fix UnboundLocalError in socket._sendfile_use_sendfile.