| 
									
										
										
										
											2007-08-09 01:03:29 +00:00
										 |  |  | import io | 
					
						
							| 
									
										
										
										
											2016-09-18 11:21:57 +03:00
										 |  |  | import os | 
					
						
							| 
									
										
										
										
											2017-09-07 18:56:24 +02:00
										 |  |  | import threading | 
					
						
							| 
									
										
										
										
											2008-06-18 20:49:58 +00:00
										 |  |  | import unittest | 
					
						
							|  |  |  | import urllib.robotparser | 
					
						
							| 
									
										
										
										
											2008-05-20 21:35:26 +00:00
										 |  |  | from test import support | 
					
						
							| 
									
										
										
										
											2020-04-25 10:06:29 +03:00
										 |  |  | from test.support import socket_helper | 
					
						
							| 
									
										
										
										
											2020-05-28 06:10:27 +08:00
										 |  |  | from test.support import threading_helper | 
					
						
							| 
									
										
										
										
											2014-06-25 02:58:15 -07:00
										 |  |  | from http.server import BaseHTTPRequestHandler, HTTPServer | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-02-28 15:24:47 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  | class BaseRobotTest: | 
					
						
							|  |  |  |     robots_txt = '' | 
					
						
							|  |  |  |     agent = 'test_robotparser' | 
					
						
							|  |  |  |     good = [] | 
					
						
							|  |  |  |     bad = [] | 
					
						
							| 
									
										
										
										
											2018-05-16 07:52:07 -07:00
										 |  |  |     site_maps = None | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def setUp(self): | 
					
						
							|  |  |  |         lines = io.StringIO(self.robots_txt).readlines() | 
					
						
							|  |  |  |         self.parser = urllib.robotparser.RobotFileParser() | 
					
						
							|  |  |  |         self.parser.parse(lines) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def get_agent_and_url(self, url): | 
					
						
							|  |  |  |         if isinstance(url, tuple): | 
					
						
							|  |  |  |             agent, url = url | 
					
						
							|  |  |  |             return agent, url | 
					
						
							|  |  |  |         return self.agent, url | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def test_good_urls(self): | 
					
						
							|  |  |  |         for url in self.good: | 
					
						
							|  |  |  |             agent, url = self.get_agent_and_url(url) | 
					
						
							|  |  |  |             with self.subTest(url=url, agent=agent): | 
					
						
							|  |  |  |                 self.assertTrue(self.parser.can_fetch(agent, url)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def test_bad_urls(self): | 
					
						
							|  |  |  |         for url in self.bad: | 
					
						
							|  |  |  |             agent, url = self.get_agent_and_url(url) | 
					
						
							|  |  |  |             with self.subTest(url=url, agent=agent): | 
					
						
							|  |  |  |                 self.assertFalse(self.parser.can_fetch(agent, url)) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-05-16 07:52:07 -07:00
										 |  |  |     def test_site_maps(self): | 
					
						
							|  |  |  |         self.assertEqual(self.parser.site_maps(), self.site_maps) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  | 
 | 
					
						
							|  |  |  | class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase): | 
					
						
							|  |  |  |     robots_txt = """\
 | 
					
						
							| 
									
										
										
										
											2002-02-28 15:24:47 +00:00
										 |  |  | User-agent: * | 
					
						
							|  |  |  | Disallow: /cyberworld/map/ # This is an infinite virtual URL space | 
					
						
							|  |  |  | Disallow: /tmp/ # these will soon disappear | 
					
						
							|  |  |  | Disallow: /foo.html | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  |     """
 | 
					
						
							|  |  |  |     good = ['/', '/test.html'] | 
					
						
							|  |  |  |     bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html'] | 
					
						
							| 
									
										
										
										
											2002-02-28 15:24:47 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  | class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase): | 
					
						
							|  |  |  |     robots_txt = """\
 | 
					
						
							| 
									
										
										
										
											2002-02-28 15:24:47 +00:00
										 |  |  | # robots.txt for http://www.example.com/ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | User-agent: * | 
					
						
							| 
									
										
										
										
											2015-10-08 12:27:06 +03:00
										 |  |  | Crawl-delay: 1 | 
					
						
							|  |  |  | Request-rate: 3/15 | 
					
						
							| 
									
										
										
										
											2002-02-28 15:24:47 +00:00
										 |  |  | Disallow: /cyberworld/map/ # This is an infinite virtual URL space | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Cybermapper knows where to go. | 
					
						
							|  |  |  | User-agent: cybermapper | 
					
						
							|  |  |  | Disallow: | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  |     """
 | 
					
						
							|  |  |  |     good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')] | 
					
						
							|  |  |  |     bad = ['/cyberworld/map/index.html'] | 
					
						
							| 
									
										
										
										
											2002-02-28 15:24:47 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-05-16 07:52:07 -07:00
										 |  |  | class SitemapTest(BaseRobotTest, unittest.TestCase): | 
					
						
							|  |  |  |     robots_txt = """\
 | 
					
						
							|  |  |  | # robots.txt for http://www.example.com/ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | User-agent: * | 
					
						
							|  |  |  | Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml | 
					
						
							|  |  |  | Sitemap: http://www.google.com/hostednews/sitemap_index.xml | 
					
						
							|  |  |  | Request-rate: 3/15 | 
					
						
							|  |  |  | Disallow: /cyberworld/map/ # This is an infinite virtual URL space | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     good = ['/', '/test.html'] | 
					
						
							|  |  |  |     bad = ['/cyberworld/map/index.html'] | 
					
						
							|  |  |  |     site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml', | 
					
						
							|  |  |  |                  'http://www.google.com/hostednews/sitemap_index.xml'] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  | class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase): | 
					
						
							|  |  |  |     robots_txt = """\
 | 
					
						
							| 
									
										
										
										
											2002-02-28 15:24:47 +00:00
										 |  |  | # go away | 
					
						
							|  |  |  | User-agent: * | 
					
						
							|  |  |  | Disallow: / | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  |     """
 | 
					
						
							|  |  |  |     good = [] | 
					
						
							|  |  |  |     bad = ['/cyberworld/map/index.html', '/', '/tmp/'] | 
					
						
							| 
									
										
										
										
											2002-02-28 15:24:47 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-18 20:17:58 +03:00
										 |  |  | class BaseRequestRateTest(BaseRobotTest): | 
					
						
							| 
									
										
										
										
											2019-06-16 08:48:57 +02:00
										 |  |  |     request_rate = None | 
					
						
							|  |  |  |     crawl_delay = None | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def test_request_rate(self): | 
					
						
							| 
									
										
										
										
											2019-06-16 08:48:57 +02:00
										 |  |  |         parser = self.parser | 
					
						
							| 
									
										
										
										
											2016-09-18 20:17:58 +03:00
										 |  |  |         for url in self.good + self.bad: | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  |             agent, url = self.get_agent_and_url(url) | 
					
						
							|  |  |  |             with self.subTest(url=url, agent=agent): | 
					
						
							| 
									
										
										
										
											2019-06-16 08:48:57 +02:00
										 |  |  |                 self.assertEqual(parser.crawl_delay(agent), self.crawl_delay) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 parsed_request_rate = parser.request_rate(agent) | 
					
						
							|  |  |  |                 self.assertEqual(parsed_request_rate, self.request_rate) | 
					
						
							|  |  |  |                 if self.request_rate is not None: | 
					
						
							| 
									
										
										
										
											2017-11-24 02:40:26 +03:00
										 |  |  |                     self.assertIsInstance( | 
					
						
							| 
									
										
										
										
											2019-06-16 08:48:57 +02:00
										 |  |  |                         parsed_request_rate, | 
					
						
							| 
									
										
										
										
											2017-11-24 02:40:26 +03:00
										 |  |  |                         urllib.robotparser.RequestRate | 
					
						
							|  |  |  |                     ) | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  |                     self.assertEqual( | 
					
						
							| 
									
										
										
										
											2019-06-16 08:48:57 +02:00
										 |  |  |                         parsed_request_rate.requests, | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  |                         self.request_rate.requests | 
					
						
							|  |  |  |                     ) | 
					
						
							|  |  |  |                     self.assertEqual( | 
					
						
							| 
									
										
										
										
											2019-06-16 08:48:57 +02:00
										 |  |  |                         parsed_request_rate.seconds, | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  |                         self.request_rate.seconds | 
					
						
							|  |  |  |                     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-06-16 08:48:57 +02:00
										 |  |  | class EmptyFileTest(BaseRequestRateTest, unittest.TestCase): | 
					
						
							|  |  |  |     robots_txt = '' | 
					
						
							|  |  |  |     good = ['/foo'] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-18 20:17:58 +03:00
										 |  |  | class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase): | 
					
						
							|  |  |  |     robots_txt = """\
 | 
					
						
							|  |  |  | User-agent: figtree | 
					
						
							|  |  |  | Crawl-delay: 3 | 
					
						
							|  |  |  | Request-rate: 9/30 | 
					
						
							|  |  |  | Disallow: /tmp | 
					
						
							|  |  |  | Disallow: /a%3cd.html | 
					
						
							|  |  |  | Disallow: /a%2fb.html | 
					
						
							|  |  |  | Disallow: /%7ejoe/index.html | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     agent = 'figtree' | 
					
						
							| 
									
										
										
										
											2017-11-24 02:40:26 +03:00
										 |  |  |     request_rate = urllib.robotparser.RequestRate(9, 30) | 
					
						
							| 
									
										
										
										
											2016-09-18 20:17:58 +03:00
										 |  |  |     crawl_delay = 3 | 
					
						
							|  |  |  |     good = [('figtree', '/foo.html')] | 
					
						
							|  |  |  |     bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', | 
					
						
							|  |  |  |            '/a%2fb.html', '/~joe/index.html'] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  | class DifferentAgentTest(CrawlDelayAndRequestRateTest): | 
					
						
							|  |  |  |     agent = 'FigTree Robot libwww-perl/5.04' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase): | 
					
						
							|  |  |  |     robots_txt = """\
 | 
					
						
							| 
									
										
										
										
											2002-02-28 15:24:47 +00:00
										 |  |  | User-agent: * | 
					
						
							|  |  |  | Disallow: /tmp/ | 
					
						
							|  |  |  | Disallow: /a%3Cd.html | 
					
						
							|  |  |  | Disallow: /a/b.html | 
					
						
							|  |  |  | Disallow: /%7ejoe/index.html | 
					
						
							| 
									
										
										
										
											2015-10-08 12:27:06 +03:00
										 |  |  | Crawl-delay: 3 | 
					
						
							|  |  |  | Request-rate: 9/banana | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  |     """
 | 
					
						
							|  |  |  |     good = ['/tmp'] | 
					
						
							|  |  |  |     bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html', | 
					
						
							|  |  |  |            '/%7Ejoe/index.html'] | 
					
						
							|  |  |  |     crawl_delay = 3 | 
					
						
							| 
									
										
										
										
											2002-02-28 15:24:47 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  | class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase): | 
					
						
							|  |  |  |     # From bug report #523041 | 
					
						
							|  |  |  |     robots_txt = """\
 | 
					
						
							| 
									
										
										
										
											2002-02-28 15:24:47 +00:00
										 |  |  | User-Agent: * | 
					
						
							|  |  |  | Disallow: /. | 
					
						
							| 
									
										
										
										
											2015-10-08 12:27:06 +03:00
										 |  |  | Crawl-delay: pears | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  |     """
 | 
					
						
							|  |  |  |     good = ['/foo.html'] | 
					
						
							|  |  |  |     # bug report says "/" should be denied, but that is not in the RFC | 
					
						
							|  |  |  |     bad = [] | 
					
						
							| 
									
										
										
										
											2015-10-08 12:27:06 +03:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-02-28 15:24:47 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  | class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase): | 
					
						
							|  |  |  |     # also test that Allow and Diasallow works well with each other | 
					
						
							|  |  |  |     robots_txt = """\
 | 
					
						
							| 
									
										
											  
											
												Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
  r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line
  Finish-up the partial conversion from int to Py_ssize_t for deque indices and length.
........
  r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line
  Parse to the correct datatype.
........
  r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line
  fix spacing
........
  r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line
  fix markup
........
  r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line
  add some documentation for 2to3
........
  r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line
  Finish conversion from int to Py_ssize_t.
........
  r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line
  Convert from long to Py_ssize_t.
........
  r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines
  Fix indentation.
........
  r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line
  teach .bzrignore about doc tools
........
  r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line
  document default value for fillvalue
........
  r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line
  Issue 1592:  Better error reporting for operations on closed shelves.
........
  r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line
  fix indentation
........
  r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line
  This sentence continues to bug me; rewrite it for the second time
........
  r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line
  Remove extra words
........
  r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines
  Close issue 3437 - missing state change when Allow lines are processed.
  Adds test cases which use Allow: as well.
........
  r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines
  note robotparser bug fix.
........
											
										 
											2008-07-31 16:23:04 +00:00
										 |  |  | User-agent: Googlebot | 
					
						
							|  |  |  | Allow: /folder1/myfile.html | 
					
						
							|  |  |  | Disallow: /folder1/ | 
					
						
							| 
									
										
										
										
											2015-10-08 12:27:06 +03:00
										 |  |  | Request-rate: whale/banana | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  |     """
 | 
					
						
							|  |  |  |     agent = 'Googlebot' | 
					
						
							|  |  |  |     good = ['/folder1/myfile.html'] | 
					
						
							|  |  |  |     bad = ['/folder1/anotherfile.html'] | 
					
						
							| 
									
										
											  
											
												Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
  r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line
  Finish-up the partial conversion from int to Py_ssize_t for deque indices and length.
........
  r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line
  Parse to the correct datatype.
........
  r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line
  fix spacing
........
  r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line
  fix markup
........
  r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line
  add some documentation for 2to3
........
  r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line
  Finish conversion from int to Py_ssize_t.
........
  r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line
  Convert from long to Py_ssize_t.
........
  r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines
  Fix indentation.
........
  r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line
  teach .bzrignore about doc tools
........
  r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line
  document default value for fillvalue
........
  r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line
  Issue 1592:  Better error reporting for operations on closed shelves.
........
  r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line
  fix indentation
........
  r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line
  This sentence continues to bug me; rewrite it for the second time
........
  r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line
  Remove extra words
........
  r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines
  Close issue 3437 - missing state change when Allow lines are processed.
  Adds test cases which use Allow: as well.
........
  r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines
  note robotparser bug fix.
........
											
										 
											2008-07-31 16:23:04 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  | class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase): | 
					
						
							|  |  |  |     # the order of User-agent should be correct. note | 
					
						
							|  |  |  |     # that this file is incorrect because "Googlebot" is a | 
					
						
							|  |  |  |     # substring of "Googlebot-Mobile" | 
					
						
							|  |  |  |     robots_txt = """\
 | 
					
						
							| 
									
										
											  
											
												Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
  r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line
  Finish-up the partial conversion from int to Py_ssize_t for deque indices and length.
........
  r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line
  Parse to the correct datatype.
........
  r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line
  fix spacing
........
  r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line
  fix markup
........
  r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line
  add some documentation for 2to3
........
  r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line
  Finish conversion from int to Py_ssize_t.
........
  r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line
  Convert from long to Py_ssize_t.
........
  r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines
  Fix indentation.
........
  r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line
  teach .bzrignore about doc tools
........
  r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line
  document default value for fillvalue
........
  r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line
  Issue 1592:  Better error reporting for operations on closed shelves.
........
  r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line
  fix indentation
........
  r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line
  This sentence continues to bug me; rewrite it for the second time
........
  r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line
  Remove extra words
........
  r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines
  Close issue 3437 - missing state change when Allow lines are processed.
  Adds test cases which use Allow: as well.
........
  r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines
  note robotparser bug fix.
........
											
										 
											2008-07-31 16:23:04 +00:00
										 |  |  | User-agent: Googlebot | 
					
						
							|  |  |  | Disallow: / | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | User-agent: Googlebot-Mobile | 
					
						
							|  |  |  | Allow: / | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  |     """
 | 
					
						
							|  |  |  |     agent = 'Googlebot' | 
					
						
							|  |  |  |     bad = ['/something.jpg'] | 
					
						
							| 
									
										
											  
											
												Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
  r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line
  Finish-up the partial conversion from int to Py_ssize_t for deque indices and length.
........
  r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line
  Parse to the correct datatype.
........
  r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line
  fix spacing
........
  r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line
  fix markup
........
  r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line
  add some documentation for 2to3
........
  r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line
  Finish conversion from int to Py_ssize_t.
........
  r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line
  Convert from long to Py_ssize_t.
........
  r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines
  Fix indentation.
........
  r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line
  teach .bzrignore about doc tools
........
  r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line
  document default value for fillvalue
........
  r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line
  Issue 1592:  Better error reporting for operations on closed shelves.
........
  r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line
  fix indentation
........
  r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line
  This sentence continues to bug me; rewrite it for the second time
........
  r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line
  Remove extra words
........
  r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines
  Close issue 3437 - missing state change when Allow lines are processed.
  Adds test cases which use Allow: as well.
........
  r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines
  note robotparser bug fix.
........
											
										 
											2008-07-31 16:23:04 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
										 |  |  | class UserAgentGoogleMobileTest(UserAgentOrderingTest): | 
					
						
							|  |  |  |     agent = 'Googlebot-Mobile' | 
					
						
							| 
									
										
											  
											
												Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
  r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line
  Finish-up the partial conversion from int to Py_ssize_t for deque indices and length.
........
  r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line
  Parse to the correct datatype.
........
  r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line
  fix spacing
........
  r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line
  fix markup
........
  r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line
  add some documentation for 2to3
........
  r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line
  Finish conversion from int to Py_ssize_t.
........
  r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line
  Convert from long to Py_ssize_t.
........
  r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines
  Fix indentation.
........
  r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line
  teach .bzrignore about doc tools
........
  r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line
  document default value for fillvalue
........
  r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line
  Issue 1592:  Better error reporting for operations on closed shelves.
........
  r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line
  fix indentation
........
  r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line
  This sentence continues to bug me; rewrite it for the second time
........
  r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line
  Remove extra words
........
  r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines
  Close issue 3437 - missing state change when Allow lines are processed.
  Adds test cases which use Allow: as well.
........
  r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines
  note robotparser bug fix.
........
											
										 
											2008-07-31 16:23:04 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong. You need
    # to specify the URLs from more specific to more general
    #
    # In this fixture the specific Allow rule is listed before the broader
    # Disallow rule for the same folder; the allowed file is listed in
    # `good` and another file from that folder in `bad`.
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    # NOTE(review): differs only in case from the "Googlebot" entry above;
    # presumably exercises case-insensitive agent matching -- confirm.
    agent = 'googlebot'
    # Paths presumably checked by BaseRobotTest as fetchable / not fetchable
    # for `agent` -- verify against the base class.
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']
					
						
							| 
									
										
											  
											
												Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
  r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line
  Finish-up the partial conversion from int to Py_ssize_t for deque indices and length.
........
  r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line
  Parse to the correct datatype.
........
  r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line
  fix spacing
........
  r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line
  fix markup
........
  r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line
  add some documentation for 2to3
........
  r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line
  Finish conversion from int to Py_ssize_t.
........
  r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line
  Convert from long to Py_ssize_t.
........
  r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines
  Fix indentation.
........
  r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line
  teach .bzrignore about doc tools
........
  r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line
  document default value for fillvalue
........
  r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line
  Issue 1592:  Better error reporting for operations on closed shelves.
........
  r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line
  fix indentation
........
  r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line
  This sentence continues to bug me; rewrite it for the second time
........
  r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line
  Remove extra words
........
  r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines
  Close issue 3437 - missing state change when Allow lines are processed.
  Adds test cases which use Allow: as well.
........
  r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines
  note robotparser bug fix.
........
											
										 
											2008-07-31 16:23:04 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    #
    # The Disallow rule carries a query string: the path with that exact
    # query string is listed in `bad`, the bare path in `good`.
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']
					
						
							| 
									
										
										
										
											2010-07-28 16:27:56 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    #
    # Two wildcard entries are given; only the path disallowed by the first
    # one is listed in `bad`, the path from the second entry is in `good`.
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']
					
						
							| 
									
										
										
										
											2010-07-29 17:55:01 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-11 14:53:16 +03:00
class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    #
    # Both rules end in a bare '?' (an empty query string); the same
    # spellings are used for the `good` and `bad` paths below.
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']
					
						
							| 
									
										
										
										
											2013-05-29 05:54:31 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
											  
											
												Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
  r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line
  Finish-up the partial conversion from int to Py_ssize_t for deque indices and length.
........
  r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line
  Parse to the correct datatype.
........
  r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line
  fix spacing
........
  r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line
  fix markup
........
  r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line
  add some documentation for 2to3
........
  r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line
  Finish conversion from int to Py_ssize_t.
........
  r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line
  Convert from long to Py_ssize_t.
........
  r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines
  Fix indentation.
........
  r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line
  teach .bzrignore about doc tools
........
  r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line
  document default value for fillvalue
........
  r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line
  Issue 1592:  Better error reporting for operations on closed shelves.
........
  r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line
  fix indentation
........
  r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line
  This sentence continues to bug me; rewrite it for the second time
........
  r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line
  Remove extra words
........
  r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines
  Close issue 3437 - missing state change when Allow lines are processed.
  Adds test cases which use Allow: as well.
........
  r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines
  note robotparser bug fix.
........
											
										 
											2008-07-31 16:23:04 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-18 20:17:58 +03:00
class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    # A single wildcard entry that carries Crawl-delay and Request-rate
    # lines in addition to a Disallow rule.
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    # Expected values; they mirror the "Request-rate: 3/15" and
    # "Crawl-delay: 1" lines above.  How they are compared is defined by
    # BaseRequestRateTest, which is not visible here.
    request_rate = urllib.robotparser.RequestRate(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-05-14 10:10:41 -04:00
class StringFormattingTest(BaseRobotTest, unittest.TestCase):
    # Checks the str() rendering of a parsed robots.txt against
    # `expected_output`.
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    # Relative to the input above: the '#' comments are absent, the
    # cybermapper entry comes before the wildcard entry, and the trailing
    # backslash suppresses the final newline of the literal.
    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/\
"""

    def test_string_formatting(self):
        # self.parser is not created in this class; presumably it is built
        # from robots_txt by BaseRobotTest -- confirm against the base class.
        self.assertEqual(str(self.parser), self.expected_output)
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-06-25 02:58:15 -07:00
										 |  |  | class RobotHandler(BaseHTTPRequestHandler): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def do_GET(self): | 
					
						
							|  |  |  |         self.send_error(403, "Forbidden access") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def log_message(self, format, *args): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-16 16:02:37 +02:00
										 |  |  | @unittest.skipUnless( | 
					
						
							|  |  |  |     support.has_socket_support, | 
					
						
							|  |  |  |     "Socket server requires working socket." | 
					
						
							| 
									
										
										
										
											2022-03-22 12:04:36 +02:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2014-06-25 02:58:15 -07:00
										 |  |  | class PasswordProtectedSiteTestCase(unittest.TestCase): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def setUp(self): | 
					
						
							| 
									
										
										
										
											2019-07-02 14:50:19 +02:00
										 |  |  |         # clear _opener global variable | 
					
						
							|  |  |  |         self.addCleanup(urllib.request.urlcleanup) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-04-25 10:06:29 +03:00
										 |  |  |         self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler) | 
					
						
							| 
									
										
										
										
											2014-06-25 02:58:15 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  |         self.t = threading.Thread( | 
					
						
							|  |  |  |             name='HTTPServer serving', | 
					
						
							|  |  |  |             target=self.server.serve_forever, | 
					
						
							|  |  |  |             # Short poll interval to make the test finish quickly. | 
					
						
							|  |  |  |             # Time between requests is short enough that we won't wake | 
					
						
							|  |  |  |             # up spuriously too many times. | 
					
						
							|  |  |  |             kwargs={'poll_interval':0.01}) | 
					
						
							|  |  |  |         self.t.daemon = True  # In case this function raises. | 
					
						
							|  |  |  |         self.t.start() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def tearDown(self): | 
					
						
							|  |  |  |         self.server.shutdown() | 
					
						
							|  |  |  |         self.t.join() | 
					
						
							|  |  |  |         self.server.server_close() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-28 06:10:27 +08:00
										 |  |  |     @threading_helper.reap_threads | 
					
						
							| 
									
										
										
										
											2008-07-18 20:59:44 +00:00
										 |  |  |     def testPasswordProtectedSite(self): | 
					
						
							| 
									
										
										
										
											2014-06-25 02:58:15 -07:00
										 |  |  |         addr = self.server.server_address | 
					
						
							| 
									
										
										
										
											2020-04-25 10:06:29 +03:00
										 |  |  |         url = 'http://' + socket_helper.HOST + ':' + str(addr[1]) | 
					
						
							| 
									
										
										
										
											2014-06-25 02:58:15 -07:00
										 |  |  |         robots_url = url + "/robots.txt" | 
					
						
							|  |  |  |         parser = urllib.robotparser.RobotFileParser() | 
					
						
							|  |  |  |         parser.set_url(url) | 
					
						
							|  |  |  |         parser.read() | 
					
						
							|  |  |  |         self.assertFalse(parser.can_fetch("*", robots_url)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-06 12:08:36 +02:00
										 |  |  | @support.requires_working_socket() | 
					
						
							| 
									
										
										
										
											2014-06-25 02:58:15 -07:00
										 |  |  | class NetworkTestCase(unittest.TestCase): | 
					
						
							| 
									
										
											  
											
												Merged revisions 57620-57771 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
  r57771 | thomas.wouters | 2007-08-30 23:54:39 +0200 (Thu, 30 Aug 2007) | 5 lines
  Don't lie in __all__ attributes when SSL is not available: only add the SSL
  classes when they are actually created.
........
  r57620 | walter.doerwald | 2007-08-28 18:38:26 +0200 (Tue, 28 Aug 2007) | 5 lines
  Fix title endtag in HTMLCalender.formatyearpage(). Fix documentation for
  HTMLCalender.formatyearpage() (there's no themonth parameter).
  This fixes issue1046.
........
  r57622 | georg.brandl | 2007-08-28 20:54:44 +0200 (Tue, 28 Aug 2007) | 2 lines
  Add a crasher for the thread-unsafety of file objects.
........
  r57626 | skip.montanaro | 2007-08-29 01:22:52 +0200 (Wed, 29 Aug 2007) | 1 line
  fixes 813986
........
  r57628 | walter.doerwald | 2007-08-29 01:35:33 +0200 (Wed, 29 Aug 2007) | 2 lines
  Fix test output.
........
  r57631 | skip.montanaro | 2007-08-29 03:24:11 +0200 (Wed, 29 Aug 2007) | 2 lines
  Install pygettext (once the scriptsinstall target is working again).
........
  r57633 | skip.montanaro | 2007-08-29 03:33:45 +0200 (Wed, 29 Aug 2007) | 2 lines
  Recent items.
........
  r57650 | neal.norwitz | 2007-08-29 08:15:33 +0200 (Wed, 29 Aug 2007) | 1 line
  Add Bill as a developer
........
  r57651 | facundo.batista | 2007-08-29 12:28:28 +0200 (Wed, 29 Aug 2007) | 5 lines
  Ignore test failures caused by 'resource temporarily unavailable'
  exceptions raised during FailingServerTestCase tests.
  [GSoC - Alan McIntyre]
........
  r57680 | bill.janssen | 2007-08-30 00:35:05 +0200 (Thu, 30 Aug 2007) | 17 lines
  This contains a number of things:
  1) Improve the documentation of the SSL module, with a fuller
     explanation of certificate usage, another reference, proper
     formatting of this and that.
  2) Fix Windows bug in ssl.py, and general bug in sslsocket.close().
     Remove some unused code from ssl.py.  Allow accept() to be called on
     sslsocket sockets.
  3) Use try-except-else in import of ssl in socket.py.  Deprecate use of
     socket.ssl().
  4) Remove use of socket.ssl() in every library module, except for
     test_socket_ssl.py and test_ssl.py.
........
  r57714 | georg.brandl | 2007-08-30 12:09:42 +0200 (Thu, 30 Aug 2007) | 2 lines
  Stronger urge to convert filenames to str before using them as argument to ZipFile.write().
........
  r57716 | georg.brandl | 2007-08-30 12:38:56 +0200 (Thu, 30 Aug 2007) | 2 lines
  Patch #1680959: add test suite for pipes module.
........
  r57717 | georg.brandl | 2007-08-30 14:32:23 +0200 (Thu, 30 Aug 2007) | 3 lines
  * Skip test_pipes on non-POSIX.
  * Don't raise TestSkipped within a test function.
........
  r57723 | mark.summerfield | 2007-08-30 17:03:03 +0200 (Thu, 30 Aug 2007) | 3 lines
  Added more cross-references.
........
  r57726 | walter.doerwald | 2007-08-30 17:30:09 +0200 (Thu, 30 Aug 2007) | 2 lines
  Rewrap line.
........
  r57727 | walter.doerwald | 2007-08-30 17:34:55 +0200 (Thu, 30 Aug 2007) | 2 lines
  Set startinpos before calling the error handler.
........
  r57730 | bill.janssen | 2007-08-30 19:07:28 +0200 (Thu, 30 Aug 2007) | 3 lines
  Added docstrings to methods and functions.
........
  r57743 | bill.janssen | 2007-08-30 20:08:06 +0200 (Thu, 30 Aug 2007) | 1 line
  added note on new ssl module and deprecation of socket.ssl
........
  r57747 | martin.v.loewis | 2007-08-30 20:14:01 +0200 (Thu, 30 Aug 2007) | 1 line
  Fix popen usage.
........
  r57748 | martin.v.loewis | 2007-08-30 20:15:22 +0200 (Thu, 30 Aug 2007) | 1 line
  Fix typo.
........
  r57750 | martin.v.loewis | 2007-08-30 20:25:47 +0200 (Thu, 30 Aug 2007) | 1 line
  Bug #1746880: Correctly install DLLs into system32 folder on Win64.
........
  r57760 | martin.v.loewis | 2007-08-30 21:04:09 +0200 (Thu, 30 Aug 2007) | 1 line
  Bug #1709599: Run test_1565150 only if the file system is NTFS.
........
  r57762 | martin.v.loewis | 2007-08-30 22:10:57 +0200 (Thu, 30 Aug 2007) | 2 lines
  Bump autoconf minimum version to 2.61.
........
  r57764 | lars.gustaebel | 2007-08-30 22:24:31 +0200 (Thu, 30 Aug 2007) | 2 lines
  Warn about possible risks when extracting untrusted archives.
........
  r57769 | thomas.wouters | 2007-08-30 23:01:17 +0200 (Thu, 30 Aug 2007) | 7 lines
  Somewhat-preliminary slice-object and extended slicing support for ctypes.
  The exact behaviour of omitted and negative indices for the Pointer type may
  need a closer look (especially as it's subtly different from simple slices)
  but there's time yet before 2.6, and not enough before 3.0a1 :-)
........
											
										 
											2007-08-30 22:15:33 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-18 11:21:57 +03:00
										 |  |  |     base_url = 'http://www.pythontest.net/' | 
					
						
							|  |  |  |     robots_txt = '{}elsewhere/robots.txt'.format(base_url) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @classmethod | 
					
						
							|  |  |  |     def setUpClass(cls): | 
					
						
							| 
									
										
											  
											
												(partially)
Merged revisions 79534,79537,79539,79558,79606 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
  r79534 | florent.xicluna | 2010-03-31 23:21:54 +0200 (mer, 31 mar 2010) | 2 lines
  Fix test for xml.etree when using a non-ascii path.  And use check_warnings instead of catch_warnings.
........
  r79537 | florent.xicluna | 2010-03-31 23:40:32 +0200 (mer, 31 mar 2010) | 2 lines
  Fix typo
........
  r79539 | florent.xicluna | 2010-04-01 00:01:03 +0200 (jeu, 01 avr 2010) | 2 lines
  Replace catch_warnings with check_warnings when it makes sense.  Use assertRaises context manager to simplify some tests.
........
  r79558 | florent.xicluna | 2010-04-01 20:17:09 +0200 (jeu, 01 avr 2010) | 2 lines
  #7092: Fix some -3 warnings, and fix Lib/platform.py when the path contains a double-quote.
........
  r79606 | florent.xicluna | 2010-04-02 19:26:42 +0200 (ven, 02 avr 2010) | 2 lines
  Backport some robotparser test and skip the test if the external resource is not available.
........
											
										 
											2010-04-02 18:52:12 +00:00
										 |  |  |         support.requires('network') | 
					
						
							| 
									
										
										
										
											2020-04-29 10:36:20 +03:00
										 |  |  |         with socket_helper.transient_internet(cls.base_url): | 
					
						
							| 
									
										
										
										
											2016-09-18 11:21:57 +03:00
										 |  |  |             cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt) | 
					
						
							|  |  |  |             cls.parser.read() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def url(self, path): | 
					
						
							|  |  |  |         return '{}{}{}'.format( | 
					
						
							|  |  |  |             self.base_url, path, '/' if not os.path.splitext(path)[1] else '' | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def test_basic(self): | 
					
						
							|  |  |  |         self.assertFalse(self.parser.disallow_all) | 
					
						
							|  |  |  |         self.assertFalse(self.parser.allow_all) | 
					
						
							|  |  |  |         self.assertGreater(self.parser.mtime(), 0) | 
					
						
							|  |  |  |         self.assertFalse(self.parser.crawl_delay('*')) | 
					
						
							|  |  |  |         self.assertFalse(self.parser.request_rate('*')) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def test_can_fetch(self): | 
					
						
							|  |  |  |         self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere'))) | 
					
						
							|  |  |  |         self.assertFalse(self.parser.can_fetch('Nutch', self.base_url)) | 
					
						
							|  |  |  |         self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian'))) | 
					
						
							|  |  |  |         self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats'))) | 
					
						
							|  |  |  |         self.assertFalse(self.parser.can_fetch('*', self.url('webstats'))) | 
					
						
							|  |  |  |         self.assertTrue(self.parser.can_fetch('*', self.base_url)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def test_read_404(self): | 
					
						
							|  |  |  |         parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt')) | 
					
						
							|  |  |  |         parser.read() | 
					
						
							|  |  |  |         self.assertTrue(parser.allow_all) | 
					
						
							|  |  |  |         self.assertFalse(parser.disallow_all) | 
					
						
							|  |  |  |         self.assertEqual(parser.mtime(), 0) | 
					
						
							| 
									
										
										
										
											2016-09-18 20:17:58 +03:00
										 |  |  |         self.assertIsNone(parser.crawl_delay('*')) | 
					
						
							|  |  |  |         self.assertIsNone(parser.request_rate('*')) | 
					
						
							| 
									
										
										
										
											2008-07-18 20:59:44 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-02-28 15:24:47 +00:00
										 |  |  | if __name__=='__main__': | 
					
						
							| 
									
										
										
										
											2013-03-12 07:49:12 +02:00
										 |  |  |     unittest.main() |