cpython/Lib/test/test_robotparser.py

821 lines
24 KiB
Python
Raw Normal View History

import io
import os
import threading
import unittest
import urllib.robotparser
from test import support
from test.support import socket_helper
from test.support import threading_helper
from http.server import BaseHTTPRequestHandler, HTTPServer
class BaseRobotTest:
robots_txt = ''
agent = 'test_robotparser'
good = []
bad = []
site_maps = None
expected_output = None
def __init_subclass__(cls):
super().__init_subclass__()
# Remove tests that do nothing.
if issubclass(cls, unittest.TestCase):
if not cls.good:
cls.test_good_urls = None
if not cls.bad:
cls.test_bad_urls = None
if cls.expected_output is None:
cls.test_string_formatting = None
def setUp(self):
lines = io.StringIO(self.robots_txt).readlines()
self.parser = urllib.robotparser.RobotFileParser()
self.parser.parse(lines)
def get_agent_and_url(self, url):
if isinstance(url, tuple):
agent, url = url
return agent, url
return self.agent, url
def test_good_urls(self):
for url in self.good:
agent, url = self.get_agent_and_url(url)
with self.subTest(url=url, agent=agent):
self.assertTrue(self.parser.can_fetch(agent, url))
def test_bad_urls(self):
for url in self.bad:
agent, url = self.get_agent_and_url(url)
with self.subTest(url=url, agent=agent):
self.assertFalse(self.parser.can_fetch(agent, url))
def test_site_maps(self):
self.assertEqual(self.parser.site_maps(), self.site_maps)
def test_string_formatting(self):
self.assertEqual(str(self.parser), self.expected_output)
class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""
good = ['/', '/test.html']
bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']
class SimpleExampleTest(BaseRobotTest, unittest.TestCase):
# Example from RFC 9309, section 5.1.
robots_txt = """\
User-Agent: *
Disallow: *.gif$
Disallow: /example/
Allow: /publications/
User-Agent: foobot
Disallow:/
Allow:/example/page.html
Allow:/example/allowed.gif
User-Agent: barbot
User-Agent: bazbot
Disallow: /example/page.html
User-Agent: quxbot
"""
good = [
'/', '/publications/',
('foobot', '/example/page.html'), ('foobot', '/example/allowed.gif'),
('barbot', '/'), ('barbot', '/example/'),
('barbot', '/example/allowed.gif'),
('barbot', '/example/disallowed.gif'),
('barbot', '/publications/'),
('barbot', '/publications/allowed.gif'),
('bazbot', '/'), ('bazbot', '/example/'),
('bazbot', '/example/allowed.gif'),
('bazbot', '/example/disallowed.gif'),
('bazbot', '/publications/'),
('bazbot', '/publications/allowed.gif'),
('quxbot', '/'), ('quxbot', '/example/'),
('quxbot', '/example/page.html'), ('quxbot', '/example/allowed.gif'),
('quxbot', '/example/disallowed.gif'),
('quxbot', '/publications/'),
('quxbot', '/publications/allowed.gif'),
]
bad = [
'/example/', '/example/page.html', '/example/allowed.gif',
'/example/disallowed.gif',
'/publications/allowed.gif',
('foobot', '/'), ('foobot', '/example/'),
('foobot', '/example/disallowed.gif'),
('foobot', '/publications/'),
('foobot', '/publications/allowed.gif'),
('barbot', '/example/page.html'),
('bazbot', '/example/page.html'),
]
class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
# robots.txt for http://www.example.com/
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
"""
good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']
class SitemapTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
# robots.txt for http://www.example.com/
User-agent: *
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
"""
good = ['/', '/test.html']
bad = ['/cyberworld/map/index.html']
site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
'http://www.google.com/hostednews/sitemap_index.xml']
class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
# go away
User-agent: *
Disallow: /
"""
good = ['/robots.txt']
bad = ['/cyberworld/map/index.html', '/', '/tmp/']
class BaseRequestRateTest(BaseRobotTest):
request_rate = None
crawl_delay = None
def test_request_rate(self):
parser = self.parser
for url in self.good + self.bad:
agent, url = self.get_agent_and_url(url)
with self.subTest(url=url, agent=agent):
self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)
parsed_request_rate = parser.request_rate(agent)
self.assertEqual(parsed_request_rate, self.request_rate)
if self.request_rate is not None:
self.assertIsInstance(
parsed_request_rate,
urllib.robotparser.RequestRate
)
self.assertEqual(
parsed_request_rate.requests,
self.request_rate.requests
)
self.assertEqual(
parsed_request_rate.seconds,
self.request_rate.seconds
)
class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
robots_txt = ''
good = ['/foo']
expected_output = ''
class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""
agent = 'figtree'
request_rate = urllib.robotparser.RequestRate(9, 30)
crawl_delay = 3
good = [('figtree', '/foo.html')]
bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
'/a%2fb.html', '/~joe/index.html']
class DifferentAgentTest(CrawlDelayAndRequestRateTest):
agent = 'FigTree Robot libwww-perl/5.04'
class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
"""
good = ['/tmp']
bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
'/%7Ejoe/index.html']
crawl_delay = 3
class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
# From bug report #523041
robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
"""
good = ['/foo.html']
# bug report says "/" should be denied, but that is not in the RFC
bad = []
class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
# also test that Allow and Diasallow works well with each other
robots_txt = """\
Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line Finish-up the partial conversion from int to Py_ssize_t for deque indices and length. ........ r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line Parse to the correct datatype. ........ r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line fix spacing ........ r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line fix markup ........ r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line add some documentation for 2to3 ........ r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line Finish conversion from int to Py_ssize_t. ........ r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line Convert from long to Py_ssize_t. ........ r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines Fix indentation. ........ r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line teach .bzrignore about doc tools ........ r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line document default value for fillvalue ........ r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line Issue 1592: Better error reporting for operations on closed shelves. ........ r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line fix indentation ........ r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line This sentence continues to bug me; rewrite it for the second time ........ r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line Remove extra words ........ r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines Close issue 3437 - missing state change when Allow lines are processed. Adds test cases which use Allow: as well. ........ r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines note robotparser bug fix. ........
2008-07-31 16:23:04 +00:00
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
"""
agent = 'Googlebot'
good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']
Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line Finish-up the partial conversion from int to Py_ssize_t for deque indices and length. ........ r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line Parse to the correct datatype. ........ r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line fix spacing ........ r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line fix markup ........ r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line add some documentation for 2to3 ........ r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line Finish conversion from int to Py_ssize_t. ........ r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line Convert from long to Py_ssize_t. ........ r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines Fix indentation. ........ r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line teach .bzrignore about doc tools ........ r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line document default value for fillvalue ........ r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line Issue 1592: Better error reporting for operations on closed shelves. ........ r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line fix indentation ........ r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line This sentence continues to bug me; rewrite it for the second time ........ r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line Remove extra words ........ r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines Close issue 3437 - missing state change when Allow lines are processed. Adds test cases which use Allow: as well. ........ r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines note robotparser bug fix. ........
2008-07-31 16:23:04 +00:00
class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
# the order of User-agent should not matter
robots_txt = """\
Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line Finish-up the partial conversion from int to Py_ssize_t for deque indices and length. ........ r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line Parse to the correct datatype. ........ r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line fix spacing ........ r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line fix markup ........ r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line add some documentation for 2to3 ........ r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line Finish conversion from int to Py_ssize_t. ........ r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line Convert from long to Py_ssize_t. ........ r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines Fix indentation. ........ r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line teach .bzrignore about doc tools ........ r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line document default value for fillvalue ........ r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line Issue 1592: Better error reporting for operations on closed shelves. ........ r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line fix indentation ........ r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line This sentence continues to bug me; rewrite it for the second time ........ r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line Remove extra words ........ r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines Close issue 3437 - missing state change when Allow lines are processed. Adds test cases which use Allow: as well. ........ r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines note robotparser bug fix. ........
2008-07-31 16:23:04 +00:00
User-agent: Googlebot
Disallow: /
Allow: /folder1/
Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line Finish-up the partial conversion from int to Py_ssize_t for deque indices and length. ........ r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line Parse to the correct datatype. ........ r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line fix spacing ........ r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line fix markup ........ r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line add some documentation for 2to3 ........ r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line Finish conversion from int to Py_ssize_t. ........ r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line Convert from long to Py_ssize_t. ........ r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines Fix indentation. ........ r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line teach .bzrignore about doc tools ........ r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line document default value for fillvalue ........ r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line Issue 1592: Better error reporting for operations on closed shelves. ........ r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line fix indentation ........ r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line This sentence continues to bug me; rewrite it for the second time ........ r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line Remove extra words ........ r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines Close issue 3437 - missing state change when Allow lines are processed. Adds test cases which use Allow: as well. ........ r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines note robotparser bug fix. ........
2008-07-31 16:23:04 +00:00
User-agent: Googlebot-Mobile
Allow: /
Disallow: /folder1/
"""
agent = 'Googlebot'
bad = ['/something.jpg']
good = ['/folder1/myfile.html']
Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line Finish-up the partial conversion from int to Py_ssize_t for deque indices and length. ........ r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line Parse to the correct datatype. ........ r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line fix spacing ........ r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line fix markup ........ r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line add some documentation for 2to3 ........ r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line Finish conversion from int to Py_ssize_t. ........ r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line Convert from long to Py_ssize_t. ........ r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines Fix indentation. ........ r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line teach .bzrignore about doc tools ........ r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line document default value for fillvalue ........ r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line Issue 1592: Better error reporting for operations on closed shelves. ........ r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line fix indentation ........ r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line This sentence continues to bug me; rewrite it for the second time ........ r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line Remove extra words ........ r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines Close issue 3437 - missing state change when Allow lines are processed. Adds test cases which use Allow: as well. ........ r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines note robotparser bug fix. ........
2008-07-31 16:23:04 +00:00
class UserAgentGoogleMobileTest(UserAgentOrderingTest):
agent = 'Googlebot-mobile'
bad = ['/folder1/myfile.html']
good = ['/something.jpg']
Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line Finish-up the partial conversion from int to Py_ssize_t for deque indices and length. ........ r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line Parse to the correct datatype. ........ r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line fix spacing ........ r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line fix markup ........ r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line add some documentation for 2to3 ........ r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line Finish conversion from int to Py_ssize_t. ........ r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line Convert from long to Py_ssize_t. ........ r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines Fix indentation. ........ r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line teach .bzrignore about doc tools ........ r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line document default value for fillvalue ........ r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line Issue 1592: Better error reporting for operations on closed shelves. ........ r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line fix indentation ........ r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line This sentence continues to bug me; rewrite it for the second time ........ r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line Remove extra words ........ r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines Close issue 3437 - missing state change when Allow lines are processed. Adds test cases which use Allow: as well. ........ r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines note robotparser bug fix. ........
2008-07-31 16:23:04 +00:00
class LongestMatchTest(BaseRobotTest, unittest.TestCase):
# Based on example from RFC 9309, section 5.2.
robots_txt = """\
User-agent: *
Allow: /example/page/
Disallow: /example/page/disallowed.gif
Allow: /example/
"""
good = ['/example/', '/example/page/']
bad = ['/example/page/disallowed.gif']
class LongestMatchWildcardTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Allow: /example/page/
Disallow: *.gif
Allow: /example/
"""
good = ['/example/', '/example/page/']
bad = ['/example/page/disallowed.gif', '/x.gif']
class AllowWinsEqualMatchTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Disallow: /spam
Allow: /spam
Disallow: /spam
"""
good = ['/spam', '/spam/']
class AllowWinsEqualFullMatchTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Disallow: /spam
Allow: /spam$
Disallow: /spam
Disallow: /eggs$
Allow: /eggs
Disallow: /eggs$
"""
good = ['/spam', '/eggs', '/eggs/']
bad = ['/spam/']
class AllowWinsEqualMatchWildcardTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Disallow: /spam
Allow: *am
Disallow: /spam
Disallow: *gs
Allow: /eggs
Disallow: *gs
"""
good = ['/spam', '/eggs', '/spam/', '/eggs/']
class MergeGroupsTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: spambot
Disallow: /some/path
User-agent: spambot
Disallow: /another/path
"""
agent = 'spambot'
bad = ['/some/path', '/another/path']
class UserAgentStartsGroupTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: spambot
Disallow: /some/path
User-agent: eggsbot
Disallow: /another/path
"""
good = [('spambot', '/'), ('spambot', '/another/path'),
('eggsbot', '/'), ('eggsbot', '/some/path')]
bad = [('spambot', '/some/path'), ('eggsbot', '/another/path')]
expected_output = """\
User-agent: spambot
Disallow: /some/path
User-agent: eggsbot
Disallow: /another/path\
"""
class IgnoreEmptyLinesTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: spambot
User-agent: eggsbot
Disallow: /some/path
Disallow: /another/path
"""
good = [('spambot', '/'), ('eggsbot', '/')]
bad = [
('spambot', '/some/path'), ('spambot', '/another/path'),
('eggsbot', '/some/path'), ('eggsbot', '/another/path'),
]
expected_output = """\
User-agent: spambot
User-agent: eggsbot
Disallow: /some/path
Disallow: /another/path\
"""
class IgnoreRulesWithoutUserAgentTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
Disallow: /some/path
User-agent: *
Disallow: /another/path
"""
good = ['/', '/some/path']
bad = ['/another/path']
expected_output = """\
User-agent: *
Disallow: /another/path\
"""
class EmptyGroupTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Disallow: /some/path
User-agent: spambot
"""
agent = 'spambot'
good = ['/', '/some/path']
expected_output = """\
User-agent: *
Disallow: /some/path
User-agent: spambot
Allow:\
"""
class WeirdPathTest(BaseRobotTest, unittest.TestCase):
robots_txt = f"""\
User-agent: *
Disallow: /a$$$
Disallow: /b$z
Disallow: /c***
Disallow: /d***z
Disallow: /e*$**$$
Disallow: /f*$**$$z
Disallow: /g$*$$**
Disallow: /h$*$$**z
"""
good = ['/ax', '/a$$', '/b', '/bz', '/b$z', '/d', '/f', '/fz',
'/f$$$z', '/fx$y$$z', '/gx', '/g$$$', '/g$x$$y', '/h', '/hz',
'/h$$$z', '/h$x$$yz']
bad = ['/a', '/c', '/cxy', '/dz', '/dxyz', '/dxzy', '/e', '/exy',
'/e$$', '/ex$y$', '/g']
expected_output = """\
User-agent: *
Disallow: /a$
Disallow: /c*
Disallow: /d*z
Disallow: /e*$
Disallow: /g$\
"""
class PathWithManyWildcardsTest(BaseRobotTest, unittest.TestCase):
# This test would take many years if use naive translation to regular
# expression (* -> .*).
N = 50
robots_txt = f"""\
User-agent: *
Disallow: /{'*a'*N}*b
"""
good = ['/' + 'a'*N + 'a']
bad = ['/' + 'a'*N + 'b']
Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line Finish-up the partial conversion from int to Py_ssize_t for deque indices and length. ........ r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line Parse to the correct datatype. ........ r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line fix spacing ........ r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line fix markup ........ r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line add some documentation for 2to3 ........ r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line Finish conversion from int to Py_ssize_t. ........ r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line Convert from long to Py_ssize_t. ........ r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines Fix indentation. ........ r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line teach .bzrignore about doc tools ........ r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line document default value for fillvalue ........ r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line Issue 1592: Better error reporting for operations on closed shelves. ........ r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line fix indentation ........ r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line This sentence continues to bug me; rewrite it for the second time ........ r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line Remove extra words ........ r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines Close issue 3437 - missing state change when Allow lines are processed. Adds test cases which use Allow: as well. ........ r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines note robotparser bug fix. ........
2008-07-31 16:23:04 +00:00
class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
# see issue #6325 for details
robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
Disallow: /another/path?
Disallow: /yet/one/path?name=value&more
"""
good = ['/some/path', '/some/path?',
'/some/path%3Fname=value', '/some/path?name%3Dvalue',
'/another/path', '/another/path%3F',
'/yet/one/path?name=value%26more',
'/some/pathxname=value']
bad = ['/some/path?name=value'
'/another/path?', '/another/path?name=value',
'/yet/one/path?name=value&more']
class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Disallow: /a1/Z-._~ # unreserved characters
Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
Disallow: /u1/%F0%9F%90%8D # percent-encoded ASCII Unicode character
Disallow: /u2/%f0%9f%90%8d
Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
Disallow: /v1/%F0 # percent-encoded non-ASCII octet
Disallow: /v2/%f0
Disallow: /v3/\udcf0 # raw non-ASCII octet
Disallow: /p1%xy # raw percent
Disallow: /p2%
Disallow: /p3%25xy # percent-encoded percent
Disallow: /p4%2525xy # double percent-encoded percent
Disallow: /john%20smith # space
Disallow: /john doe
Disallow: /trailingspace%20
Disallow: /question%3Fq=v # not query
Disallow: /hash%23f # not fragment
Disallow: /dollar%24
Disallow: /asterisk%2A
Disallow: /sub/dir
Disallow: /slash%2F
Disallow: /query/question?q=%3F
Disallow: /query/raw/question?q=?
Disallow: /query/eq?q%3Dv
Disallow: /query/amp?q=v%26a
"""
good = [
'/u1/%F0', '/u1/%f0',
'/u2/%F0', '/u2/%f0',
'/u3/%F0', '/u3/%f0',
'/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
'/question?q=v',
'/dollar', '/asterisk',
'/query/eq?q=v',
'/query/amp?q=v&a',
]
bad = [
'/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
'/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
'/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
'/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
'/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
'/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
'/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
'/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
'/p1%xy', '/p1%25xy',
'/p2%', '/p2%25', '/p2%2525', '/p2%xy',
'/p3%xy', '/p3%25xy',
'/p4%2525xy',
'/john%20smith', '/john smith',
'/john%20doe', '/john doe',
'/trailingspace%20', '/trailingspace ',
'/question%3Fq=v',
'/hash#f', '/hash%23f',
'/dollar$', '/dollar%24',
'/asterisk*', '/asterisk%2A',
'/sub/dir', '/sub%2Fdir',
'/slash%2F', '/slash/',
'/query/question?q=?', '/query/question?q=%3F',
'/query/raw/question?q=?', '/query/raw/question?q=%3F',
'/query/eq?q%3Dv',
'/query/amp?q=v%26a',
]
# other reserved characters
for c in ":/#[]@!$&'()*+,;=":
robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
bad.append(f'/raw{c}')
bad.append(f'/raw%{ord(c):02X}')
bad.append(f'/pc{c}')
bad.append(f'/pc%{ord(c):02X}')
Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line Finish-up the partial conversion from int to Py_ssize_t for deque indices and length. ........ r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line Parse to the correct datatype. ........ r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line fix spacing ........ r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line fix markup ........ r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line add some documentation for 2to3 ........ r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line Finish conversion from int to Py_ssize_t. ........ r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line Convert from long to Py_ssize_t. ........ r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines Fix indentation. ........ r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line teach .bzrignore about doc tools ........ r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line document default value for fillvalue ........ r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line Issue 1592: Better error reporting for operations on closed shelves. ........ r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line fix indentation ........ r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line This sentence continues to bug me; rewrite it for the second time ........ r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line Remove extra words ........ r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines Close issue 3437 - missing state change when Allow lines are processed. Adds test cases which use Allow: as well. ........ r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines note robotparser bug fix. ........
2008-07-31 16:23:04 +00:00
class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
"""
request_rate = urllib.robotparser.RequestRate(3, 15)
crawl_delay = 1
good = ['/', '/test.html']
bad = ['/cyberworld/map/index.html']
class StringFormattingTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
"""
expected_output = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
User-agent: cybermapper
Disallow: /some/path\
"""
class ConstructedStringFormattingTest(unittest.TestCase):
def test_empty(self):
parser = urllib.robotparser.RobotFileParser()
self.assertEqual(str(parser), '')
def test_group_without_rules(self):
parser = urllib.robotparser.RobotFileParser()
entry = urllib.robotparser.Entry()
entry.useragents = ['spambot']
parser._add_entry(entry)
entry = urllib.robotparser.Entry()
entry.useragents = ['hambot']
entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
parser._add_entry(entry)
entry = urllib.robotparser.Entry()
entry.useragents = ['eggsbot']
parser._add_entry(entry)
self.assertEqual(str(parser), """\
User-agent: spambot
Allow:
User-agent: hambot
Disallow: /ham
User-agent: eggsbot
Allow:\
""")
def test_group_without_user_agent(self):
parser = urllib.robotparser.RobotFileParser()
entry = urllib.robotparser.Entry()
entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
parser._add_entry(entry)
entry = urllib.robotparser.Entry()
entry.useragents = ['spambot']
entry.rulelines = [urllib.robotparser.RuleLine('/spam', False)]
parser._add_entry(entry)
entry = urllib.robotparser.Entry()
entry.rulelines = [urllib.robotparser.RuleLine('/eggs', False)]
parser._add_entry(entry)
self.assertEqual(str(parser), """\
User-agent: spambot
Disallow: /spam\
""")
@unittest.skipUnless(
support.has_socket_support,
"Socket server requires working socket."
)
class BaseLocalNetworkTestCase:
@classmethod
def setUpClass(cls):
# clear _opener global variable
cls.addClassCleanup(urllib.request.urlcleanup)
cls.server = HTTPServer((socket_helper.HOST, 0), cls.RobotHandler)
cls.addClassCleanup(cls.server.server_close)
t = threading.Thread(
name='HTTPServer serving',
target=cls.server.serve_forever,
# Short poll interval to make the test finish quickly.
# Time between requests is short enough that we won't wake
# up spuriously too many times.
kwargs={'poll_interval':0.01})
cls.enterClassContext(threading_helper.start_threads([t]))
cls.addClassCleanup(cls.server.shutdown)
SAMPLE_ROBOTS_TXT = b'''\
User-agent: test_robotparser
Disallow: /utf8/\xf0\x9f\x90\x8d
Disallow: /non-utf8/\xf0
Disallow: //[spam]/path
'''
class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
class RobotHandler(BaseHTTPRequestHandler):
def do_GET(self):
self.send_response(200)
self.end_headers()
self.wfile.write(SAMPLE_ROBOTS_TXT)
def log_message(self, format, *args):
pass
def testRead(self):
# Test that reading a weird robots.txt doesn't fail.
addr = self.server.server_address
url = f'http://{socket_helper.HOST}:{addr[1]}'
robots_url = url + '/robots.txt'
parser = urllib.robotparser.RobotFileParser()
parser.set_url(robots_url)
parser.read()
# And it can even interpret the weird paths in some reasonable way.
agent = 'test_robotparser'
self.assertTrue(parser.can_fetch(agent, robots_url))
self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
class HttpErrorsTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
class RobotHandler(BaseHTTPRequestHandler):
def do_GET(self):
self.send_error(self.server.return_code)
def log_message(self, format, *args):
pass
def setUp(self):
# Make sure that a valid code is set in the test.
self.server.return_code = None
def testUnauthorized(self):
self.server.return_code = 401
addr = self.server.server_address
url = f'http://{socket_helper.HOST}:{addr[1]}'
robots_url = url + "/robots.txt"
parser = urllib.robotparser.RobotFileParser()
parser.set_url(url)
parser.read()
self.assertFalse(parser.can_fetch("*", robots_url))
self.assertFalse(parser.can_fetch("*", url + '/some/file.html'))
def testForbidden(self):
self.server.return_code = 403
addr = self.server.server_address
url = f'http://{socket_helper.HOST}:{addr[1]}'
robots_url = url + "/robots.txt"
parser = urllib.robotparser.RobotFileParser()
parser.set_url(url)
parser.read()
self.assertFalse(parser.can_fetch("*", robots_url))
self.assertFalse(parser.can_fetch("*", url + '/some/file.html'))
def testNotFound(self):
self.server.return_code = 404
addr = self.server.server_address
url = f'http://{socket_helper.HOST}:{addr[1]}'
robots_url = url + "/robots.txt"
parser = urllib.robotparser.RobotFileParser()
parser.set_url(url)
parser.read()
self.assertTrue(parser.can_fetch("*", robots_url))
self.assertTrue(parser.can_fetch("*", url + '/path/file.html'))
def testTeapot(self):
self.server.return_code = 418
addr = self.server.server_address
url = f'http://{socket_helper.HOST}:{addr[1]}'
robots_url = url + "/robots.txt"
parser = urllib.robotparser.RobotFileParser()
parser.set_url(url)
parser.read()
self.assertTrue(parser.can_fetch("*", robots_url))
self.assertTrue(parser.can_fetch("*", url + '/pot-1?milk-type=Cream'))
def testServiceUnavailable(self):
self.server.return_code = 503
addr = self.server.server_address
url = f'http://{socket_helper.HOST}:{addr[1]}'
robots_url = url + "/robots.txt"
parser = urllib.robotparser.RobotFileParser()
parser.set_url(url)
parser.read()
self.assertFalse(parser.can_fetch("*", robots_url))
self.assertFalse(parser.can_fetch("*", url + '/path/file.html'))
@support.requires_working_socket()
class NetworkTestCase(unittest.TestCase):
Merged revisions 57620-57771 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r57771 | thomas.wouters | 2007-08-30 23:54:39 +0200 (Thu, 30 Aug 2007) | 5 lines Don't lie in __all__ attributes when SSL is not available: only add the SSL classes when they are actually created. ........ r57620 | walter.doerwald | 2007-08-28 18:38:26 +0200 (Tue, 28 Aug 2007) | 5 lines Fix title endtag in HTMLCalender.formatyearpage(). Fix documentation for HTMLCalender.formatyearpage() (there's no themonth parameter). This fixes issue1046. ........ r57622 | georg.brandl | 2007-08-28 20:54:44 +0200 (Tue, 28 Aug 2007) | 2 lines Add a crasher for the thread-unsafety of file objects. ........ r57626 | skip.montanaro | 2007-08-29 01:22:52 +0200 (Wed, 29 Aug 2007) | 1 line fixes 813986 ........ r57628 | walter.doerwald | 2007-08-29 01:35:33 +0200 (Wed, 29 Aug 2007) | 2 lines Fix test output. ........ r57631 | skip.montanaro | 2007-08-29 03:24:11 +0200 (Wed, 29 Aug 2007) | 2 lines Install pygettext (once the scriptsinstall target is working again). ........ r57633 | skip.montanaro | 2007-08-29 03:33:45 +0200 (Wed, 29 Aug 2007) | 2 lines Recent items. ........ r57650 | neal.norwitz | 2007-08-29 08:15:33 +0200 (Wed, 29 Aug 2007) | 1 line Add Bill as a developer ........ r57651 | facundo.batista | 2007-08-29 12:28:28 +0200 (Wed, 29 Aug 2007) | 5 lines Ignore test failures caused by 'resource temporarily unavailable' exceptions raised during FailingServerTestCase tests. [GSoC - Alan McIntyre] ........ r57680 | bill.janssen | 2007-08-30 00:35:05 +0200 (Thu, 30 Aug 2007) | 17 lines This contains a number of things: 1) Improve the documentation of the SSL module, with a fuller explanation of certificate usage, another reference, proper formatting of this and that. 2) Fix Windows bug in ssl.py, and general bug in sslsocket.close(). Remove some unused code from ssl.py. Allow accept() to be called on sslsocket sockets. 3) Use try-except-else in import of ssl in socket.py. Deprecate use of socket.ssl(). 4) Remove use of socket.ssl() in every library module, except for test_socket_ssl.py and test_ssl.py. ........ r57714 | georg.brandl | 2007-08-30 12:09:42 +0200 (Thu, 30 Aug 2007) | 2 lines Stronger urge to convert filenames to str before using them as argument to ZipFile.write(). ........ r57716 | georg.brandl | 2007-08-30 12:38:56 +0200 (Thu, 30 Aug 2007) | 2 lines Patch #1680959: add test suite for pipes module. ........ r57717 | georg.brandl | 2007-08-30 14:32:23 +0200 (Thu, 30 Aug 2007) | 3 lines * Skip test_pipes on non-POSIX. * Don't raise TestSkipped within a test function. ........ r57723 | mark.summerfield | 2007-08-30 17:03:03 +0200 (Thu, 30 Aug 2007) | 3 lines Added more cross-references. ........ r57726 | walter.doerwald | 2007-08-30 17:30:09 +0200 (Thu, 30 Aug 2007) | 2 lines Rewrap line. ........ r57727 | walter.doerwald | 2007-08-30 17:34:55 +0200 (Thu, 30 Aug 2007) | 2 lines Set startinpos before calling the error handler. ........ r57730 | bill.janssen | 2007-08-30 19:07:28 +0200 (Thu, 30 Aug 2007) | 3 lines Added docstrings to methods and functions. ........ r57743 | bill.janssen | 2007-08-30 20:08:06 +0200 (Thu, 30 Aug 2007) | 1 line added note on new ssl module and deprecation of socket.ssl ........ r57747 | martin.v.loewis | 2007-08-30 20:14:01 +0200 (Thu, 30 Aug 2007) | 1 line Fix popen usage. ........ r57748 | martin.v.loewis | 2007-08-30 20:15:22 +0200 (Thu, 30 Aug 2007) | 1 line Fix typo. ........ r57750 | martin.v.loewis | 2007-08-30 20:25:47 +0200 (Thu, 30 Aug 2007) | 1 line Bug #1746880: Correctly install DLLs into system32 folder on Win64. ........ r57760 | martin.v.loewis | 2007-08-30 21:04:09 +0200 (Thu, 30 Aug 2007) | 1 line Bug #1709599: Run test_1565150 only if the file system is NTFS. ........ r57762 | martin.v.loewis | 2007-08-30 22:10:57 +0200 (Thu, 30 Aug 2007) | 2 lines Bump autoconf minimum version to 2.61. ........ r57764 | lars.gustaebel | 2007-08-30 22:24:31 +0200 (Thu, 30 Aug 2007) | 2 lines Warn about possible risks when extracting untrusted archives. ........ r57769 | thomas.wouters | 2007-08-30 23:01:17 +0200 (Thu, 30 Aug 2007) | 7 lines Somewhat-preliminary slice-object and extended slicing support for ctypes. The exact behaviour of omitted and negative indices for the Pointer type may need a closer look (especially as it's subtly different from simple slices) but there's time yet before 2.6, and not enough before 3.0a1 :-) ........
2007-08-30 22:15:33 +00:00
base_url = 'http://www.pythontest.net/'
robots_txt = '{}elsewhere/robots.txt'.format(base_url)
@classmethod
def setUpClass(cls):
support.requires('network')
cls.addClassCleanup(urllib.request.urlcleanup)
with socket_helper.transient_internet(cls.base_url):
cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
cls.parser.read()
def url(self, path):
return '{}{}{}'.format(
self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
)
def test_basic(self):
self.assertFalse(self.parser.disallow_all)
self.assertFalse(self.parser.allow_all)
self.assertGreater(self.parser.mtime(), 0)
self.assertFalse(self.parser.crawl_delay('*'))
self.assertFalse(self.parser.request_rate('*'))
def test_can_fetch(self):
self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
self.assertTrue(self.parser.can_fetch('Nutch', self.url('brian')))
self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
self.assertTrue(self.parser.can_fetch('*', self.base_url))
def test_read_404(self):
parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
parser.read()
self.assertTrue(parser.allow_all)
self.assertFalse(parser.disallow_all)
self.assertEqual(parser.mtime(), 0)
self.assertIsNone(parser.crawl_delay('*'))
self.assertIsNone(parser.request_rate('*'))
if __name__=='__main__':
unittest.main()