mirror of https://github.com/python/cpython.git, synced 2025-11-01 06:01:29 +00:00
gh-88375, gh-111788: Fix parsing errors and normalization in robotparser (GH-138502)
* Don't fail trying to parse weird patterns.
* Don't fail trying to decode non-UTF-8 "robots.txt" files.
* No longer ignore trailing "?" in patterns and URLs.
* Distinguish raw special characters "?", "=" and "&" from the percent-encoded ones.
* Remove tests that do nothing.
parent ed522ed211
commit cb7ef18d70
4 changed files with 172 additions and 31 deletions
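To make the described behaviour concrete, here is a minimal sketch (not part of the commit) that exercises the rules from the updated DisallowQueryStringTest below, run with this change applied; the expected results mirror that test's good/bad lists, and the agent name is the one used by the test suite.

    import urllib.robotparser

    # Rules taken from the updated DisallowQueryStringTest in this commit.
    parser = urllib.robotparser.RobotFileParser()
    parser.parse("""\
    User-agent: *
    Disallow: /some/path?name=value
    Disallow: /another/path?
    """.splitlines())

    agent = 'test_robotparser'

    # A trailing "?" is no longer ignored: the bare path stays allowed,
    # while the same path with an (empty) query is disallowed.
    print(parser.can_fetch(agent, '/another/path'))            # expected: True
    print(parser.can_fetch(agent, '/another/path?'))           # expected: False

    # Raw "?" and percent-encoded "%3F" are now distinguished.
    print(parser.can_fetch(agent, '/some/path?name=value'))    # expected: False
    print(parser.can_fetch(agent, '/some/path%3Fname=value'))  # expected: True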
Lib/test/test_robotparser.py

@@ -16,6 +16,14 @@ class BaseRobotTest:
     bad = []
     site_maps = None
 
+    def __init_subclass__(cls):
+        super().__init_subclass__()
+        # Remove tests that do nothing.
+        if not cls.good:
+            cls.test_good_urls = None
+        if not cls.bad:
+            cls.test_bad_urls = None
+
     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
         self.parser = urllib.robotparser.RobotFileParser()
@@ -231,9 +239,16 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
 Disallow: /some/path?name=value
+Disallow: /another/path?
+Disallow: /yet/one/path?name=value&more
 """
-    good = ['/some/path']
-    bad = ['/some/path?name=value']
+    good = ['/some/path', '/some/path?',
+            '/some/path%3Fname=value', '/some/path?name%3Dvalue',
+            '/another/path', '/another/path%3F',
+            '/yet/one/path?name=value%26more']
+    bad = ['/some/path?name=value',
+           '/another/path?', '/another/path?name=value',
+           '/yet/one/path?name=value&more']
 
 
 class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
@@ -249,15 +264,79 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     bad = ['/some/path']
 
 
-class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
-    # normalize the URL first (#17403)
+class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
-Allow: /some/path?
-Disallow: /another/path?
+Disallow: /a1/Z-._~ # unreserved characters
+Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
+Disallow: /u1/%F0%9F%90%8D # percent-encoded ASCII Unicode character
+Disallow: /u2/%f0%9f%90%8d
+Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
+Disallow: /v1/%F0 # percent-encoded non-ASCII octet
+Disallow: /v2/%f0
+Disallow: /v3/\udcf0 # raw non-ASCII octet
+Disallow: /p1%xy # raw percent
+Disallow: /p2%
+Disallow: /p3%25xy # percent-encoded percent
+Disallow: /p4%2525xy # double percent-encoded percent
+Disallow: /john%20smith # space
+Disallow: /john doe
+Disallow: /trailingspace%20
+Disallow: /question%3Fq=v # not query
+Disallow: /hash%23f # not fragment
+Disallow: /dollar%24
+Disallow: /asterisk%2A
+Disallow: /sub/dir
+Disallow: /slash%2F
+Disallow: /query/question?q=%3F
+Disallow: /query/raw/question?q=?
+Disallow: /query/eq?q%3Dv
+Disallow: /query/amp?q=v%26a
 """
-    good = ['/some/path?']
-    bad = ['/another/path?']
+    good = [
+        '/u1/%F0', '/u1/%f0',
+        '/u2/%F0', '/u2/%f0',
+        '/u3/%F0', '/u3/%f0',
+        '/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
+        '/question?q=v',
+        '/dollar', '/asterisk',
+        '/query/eq?q=v',
+        '/query/amp?q=v&a',
+    ]
+    bad = [
+        '/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
+        '/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
+        '/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
+        '/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
+        '/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
+        '/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
+        '/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
+        '/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
+        '/p1%xy', '/p1%25xy',
+        '/p2%', '/p2%25', '/p2%2525', '/p2%xy',
+        '/p3%xy', '/p3%25xy',
+        '/p4%2525xy',
+        '/john%20smith', '/john smith',
+        '/john%20doe', '/john doe',
+        '/trailingspace%20', '/trailingspace ',
+        '/question%3Fq=v',
+        '/hash#f', '/hash%23f',
+        '/dollar$', '/dollar%24',
+        '/asterisk*', '/asterisk%2A',
+        '/sub/dir', '/sub%2Fdir',
+        '/slash%2F', '/slash/',
+        '/query/question?q=?', '/query/question?q=%3F',
+        '/query/raw/question?q=?', '/query/raw/question?q=%3F',
+        '/query/eq?q%3Dv',
+        '/query/amp?q=v%26a',
+    ]
+    # other reserved characters
+    for c in ":/#[]@!$&'()*+,;=":
+        robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
+        bad.append(f'/raw{c}')
+        bad.append(f'/raw%{ord(c):02X}')
+        bad.append(f'/pc{c}')
+        bad.append(f'/pc%{ord(c):02X}')
 
 
 class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
@@ -299,26 +378,17 @@ def test_string_formatting(self):
         self.assertEqual(str(self.parser), self.expected_output)
 
 
-class RobotHandler(BaseHTTPRequestHandler):
-
-    def do_GET(self):
-        self.send_error(403, "Forbidden access")
-
-    def log_message(self, format, *args):
-        pass
-
-
 @unittest.skipUnless(
     support.has_socket_support,
     "Socket server requires working socket."
 )
-class PasswordProtectedSiteTestCase(unittest.TestCase):
+class BaseLocalNetworkTestCase:
 
     def setUp(self):
         # clear _opener global variable
         self.addCleanup(urllib.request.urlcleanup)
 
-        self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)
+        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
 
         self.t = threading.Thread(
             name='HTTPServer serving',
@@ -335,6 +405,57 @@ def tearDown(self):
         self.t.join()
         self.server.server_close()
 
+
+SAMPLE_ROBOTS_TXT = b'''\
+User-agent: test_robotparser
+Disallow: /utf8/\xf0\x9f\x90\x8d
+Disallow: /non-utf8/\xf0
+Disallow: //[spam]/path
+'''
+
+
+class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+
+        def do_GET(self):
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(SAMPLE_ROBOTS_TXT)
+
+        def log_message(self, format, *args):
+            pass
+
+    @threading_helper.reap_threads
+    def testRead(self):
+        # Test that reading a weird robots.txt doesn't fail.
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + '/robots.txt'
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(robots_url)
+        parser.read()
+        # And it can even interpret the weird paths in some reasonable way.
+        agent = 'test_robotparser'
+        self.assertTrue(parser.can_fetch(agent, robots_url))
+        self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+        self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
+
+
+class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+
+        def do_GET(self):
+            self.send_error(403, "Forbidden access")
+
+        def log_message(self, format, *args):
+            pass
+
     @threading_helper.reap_threads
     def testPasswordProtectedSite(self):
         addr = self.server.server_address
Lib/urllib/robotparser.py

@@ -11,6 +11,7 @@
 """
 
 import collections
+import re
 import urllib.error
 import urllib.parse
 import urllib.request
@@ -20,6 +21,19 @@
 RequestRate = collections.namedtuple("RequestRate", "requests seconds")
 
 
+def normalize(path):
+    unquoted = urllib.parse.unquote(path, errors='surrogateescape')
+    return urllib.parse.quote(unquoted, errors='surrogateescape')
+
+def normalize_path(path):
+    path, sep, query = path.partition('?')
+    path = normalize(path)
+    if sep:
+        query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
+        path += '?' + query
+    return path
+
+
 class RobotFileParser:
     """ This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file.
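As an aside (not part of the diff), the following standalone sketch copies the two helpers from the hunk above and shows what they compute for a few paths; the expected values in the comments are inferred from the PercentEncodingTest cases earlier in this commit.

    import re
    import urllib.parse

    # Copies of the helpers added in the hunk above, for illustration only.
    def normalize(path):
        unquoted = urllib.parse.unquote(path, errors='surrogateescape')
        return urllib.parse.quote(unquoted, errors='surrogateescape')

    def normalize_path(path):
        path, sep, query = path.partition('?')
        path = normalize(path)
        if sep:
            query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
            path += '?' + query
        return path

    print(normalize_path('/a2/%5A%2D%2E%5F%7E'))  # expected: /a2/Z-._~ (unreserved characters are decoded)
    print(normalize_path('/u2/%f0%9f%90%8d'))     # expected: /u2/%F0%9F%90%8D (percent-encoding is normalized to upper case)
    print(normalize_path('/query/eq?q%3Dv'))      # expected: /query/eq?q%3Dv (encoded "=" stays distinct from raw "=")
    print(normalize_path('/another/path?'))       # expected: /another/path? (trailing "?" is preserved)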
@@ -55,7 +69,7 @@ def modified(self):
     def set_url(self, url):
         """Sets the URL referring to a robots.txt file."""
         self.url = url
-        self.host, self.path = urllib.parse.urlparse(url)[1:3]
+        self.host, self.path = urllib.parse.urlsplit(url)[1:3]
 
     def read(self):
         """Reads the robots.txt URL and feeds it to the parser."""
@@ -69,7 +83,7 @@ def read(self):
             err.close()
         else:
             raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
+            self.parse(raw.decode("utf-8", "surrogateescape").splitlines())
 
     def _add_entry(self, entry):
         if "*" in entry.useragents:
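A brief illustration (not from the commit) of why the "surrogateescape" handler matters here: a robots.txt containing bytes that are not valid UTF-8 previously made read() raise UnicodeDecodeError, whereas the undecodable byte is now carried through as a surrogate code point. The sample byte below is the one used by the new LocalNetworkTestCase.

    raw = b'Disallow: /non-utf8/\xf0\n'

    # raw.decode("utf-8") would raise UnicodeDecodeError for the lone \xf0 byte.
    text = raw.decode("utf-8", "surrogateescape")
    print(repr(text))   # expected: 'Disallow: /non-utf8/\udcf0\n'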
@@ -113,7 +127,7 @@ def parse(self, lines):
             line = line.split(':', 1)
             if len(line) == 2:
                 line[0] = line[0].strip().lower()
-                line[1] = urllib.parse.unquote(line[1].strip())
+                line[1] = line[1].strip()
                 if line[0] == "user-agent":
                     if state == 2:
                         self._add_entry(entry)
@@ -167,10 +181,11 @@ def can_fetch(self, useragent, url):
             return False
         # search for given user agent matches
         # the first match counts
-        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
-        url = urllib.parse.urlunparse(('','',parsed_url.path,
-            parsed_url.params,parsed_url.query, parsed_url.fragment))
-        url = urllib.parse.quote(url)
+        # TODO: The private API is used in order to preserve an empty query.
+        # This is temporary until the public API starts supporting this feature.
+        parsed_url = urllib.parse._urlsplit(url, '')
+        url = urllib.parse._urlunsplit(None, None, *parsed_url[2:])
+        url = normalize_path(url)
         if not url:
             url = "/"
         for entry in self.entries:
@@ -213,7 +228,6 @@ def __str__(self):
             entries = entries + [self.default_entry]
         return '\n\n'.join(map(str, entries))
 
-
 class RuleLine:
     """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
     (allowance==False) followed by a path."""
@@ -221,8 +235,7 @@ def __init__(self, path, allowance):
         if path == '' and not allowance:
             # an empty value means allow all
             allowance = True
-        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
-        self.path = urllib.parse.quote(path)
+        self.path = normalize_path(path)
         self.allowance = allowance
 
     def applies_to(self, filename):
@@ -268,7 +281,7 @@ def applies_to(self, useragent):
     def allowance(self, filename):
         """Preconditions:
         - our agent applies to this entry
-        - filename is URL decoded"""
+        - filename is URL encoded"""
         for line in self.rulelines:
             if line.applies_to(filename):
                 return line.allowance
Misc/NEWS.d entry (new file):

@@ -0,0 +1,3 @@
+Fix parsing errors in the :mod:`urllib.robotparser` module.
+Don't fail trying to parse weird paths.
+Don't fail trying to decode non-UTF-8 ``robots.txt`` files.
Misc/NEWS.d entry (new file):

@@ -0,0 +1,4 @@
+Fix normalization of the ``robots.txt`` rules and URLs in the
+:mod:`urllib.robotparser` module. No longer ignore trailing ``?``.
+Distinguish raw special characters ``?``, ``=`` and ``&`` from the
+percent-encoded ones.