gh-88375, gh-111788: Fix parsing errors and normalization in robotparser (GH-138502)

* Don't fail trying to parse weird patterns.
* Don't fail trying to decode non-UTF-8 "robots.txt" files.
* No longer ignore trailing "?" in patterns and URLs.
* Distinguish raw special characters "?", "=" and "&" from the
  percent-encoded ones.
* Remove tests that do nothing.
This commit is contained in:
Serhiy Storchaka 2025-09-05 18:58:42 +03:00 committed by GitHub
parent ed522ed211
commit cb7ef18d70
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 172 additions and 31 deletions

View file

@ -16,6 +16,14 @@ class BaseRobotTest:
bad = [] bad = []
site_maps = None site_maps = None
def __init_subclass__(cls):
super().__init_subclass__()
# Remove tests that do nothing.
if not cls.good:
cls.test_good_urls = None
if not cls.bad:
cls.test_bad_urls = None
def setUp(self): def setUp(self):
lines = io.StringIO(self.robots_txt).readlines() lines = io.StringIO(self.robots_txt).readlines()
self.parser = urllib.robotparser.RobotFileParser() self.parser = urllib.robotparser.RobotFileParser()
@ -231,9 +239,16 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\ robots_txt = """\
User-agent: * User-agent: *
Disallow: /some/path?name=value Disallow: /some/path?name=value
Disallow: /another/path?
Disallow: /yet/one/path?name=value&more
""" """
good = ['/some/path'] good = ['/some/path', '/some/path?',
bad = ['/some/path?name=value'] '/some/path%3Fname=value', '/some/path?name%3Dvalue',
'/another/path', '/another/path%3F',
'/yet/one/path?name=value%26more']
bad = ['/some/path?name=value'
'/another/path?', '/another/path?name=value',
'/yet/one/path?name=value&more']
class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase): class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
@ -249,15 +264,79 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
bad = ['/some/path'] bad = ['/some/path']
class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase): class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
# normalize the URL first (#17403)
robots_txt = """\ robots_txt = """\
User-agent: * User-agent: *
Allow: /some/path? Disallow: /a1/Z-._~ # unreserved characters
Disallow: /another/path? Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
Disallow: /u1/%F0%9F%90%8D # percent-encoded ASCII Unicode character
Disallow: /u2/%f0%9f%90%8d
Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
Disallow: /v1/%F0 # percent-encoded non-ASCII octet
Disallow: /v2/%f0
Disallow: /v3/\udcf0 # raw non-ASCII octet
Disallow: /p1%xy # raw percent
Disallow: /p2%
Disallow: /p3%25xy # percent-encoded percent
Disallow: /p4%2525xy # double percent-encoded percent
Disallow: /john%20smith # space
Disallow: /john doe
Disallow: /trailingspace%20
Disallow: /question%3Fq=v # not query
Disallow: /hash%23f # not fragment
Disallow: /dollar%24
Disallow: /asterisk%2A
Disallow: /sub/dir
Disallow: /slash%2F
Disallow: /query/question?q=%3F
Disallow: /query/raw/question?q=?
Disallow: /query/eq?q%3Dv
Disallow: /query/amp?q=v%26a
""" """
good = ['/some/path?'] good = [
bad = ['/another/path?'] '/u1/%F0', '/u1/%f0',
'/u2/%F0', '/u2/%f0',
'/u3/%F0', '/u3/%f0',
'/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
'/question?q=v',
'/dollar', '/asterisk',
'/query/eq?q=v',
'/query/amp?q=v&a',
]
bad = [
'/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
'/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
'/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
'/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
'/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
'/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
'/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
'/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
'/p1%xy', '/p1%25xy',
'/p2%', '/p2%25', '/p2%2525', '/p2%xy',
'/p3%xy', '/p3%25xy',
'/p4%2525xy',
'/john%20smith', '/john smith',
'/john%20doe', '/john doe',
'/trailingspace%20', '/trailingspace ',
'/question%3Fq=v',
'/hash#f', '/hash%23f',
'/dollar$', '/dollar%24',
'/asterisk*', '/asterisk%2A',
'/sub/dir', '/sub%2Fdir',
'/slash%2F', '/slash/',
'/query/question?q=?', '/query/question?q=%3F',
'/query/raw/question?q=?', '/query/raw/question?q=%3F',
'/query/eq?q%3Dv',
'/query/amp?q=v%26a',
]
# other reserved characters
for c in ":/#[]@!$&'()*+,;=":
robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
bad.append(f'/raw{c}')
bad.append(f'/raw%{ord(c):02X}')
bad.append(f'/pc{c}')
bad.append(f'/pc%{ord(c):02X}')
class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase): class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
@ -299,26 +378,17 @@ def test_string_formatting(self):
self.assertEqual(str(self.parser), self.expected_output) self.assertEqual(str(self.parser), self.expected_output)
class RobotHandler(BaseHTTPRequestHandler):
def do_GET(self):
self.send_error(403, "Forbidden access")
def log_message(self, format, *args):
pass
@unittest.skipUnless( @unittest.skipUnless(
support.has_socket_support, support.has_socket_support,
"Socket server requires working socket." "Socket server requires working socket."
) )
class PasswordProtectedSiteTestCase(unittest.TestCase): class BaseLocalNetworkTestCase:
def setUp(self): def setUp(self):
# clear _opener global variable # clear _opener global variable
self.addCleanup(urllib.request.urlcleanup) self.addCleanup(urllib.request.urlcleanup)
self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler) self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
self.t = threading.Thread( self.t = threading.Thread(
name='HTTPServer serving', name='HTTPServer serving',
@ -335,6 +405,57 @@ def tearDown(self):
self.t.join() self.t.join()
self.server.server_close() self.server.server_close()
SAMPLE_ROBOTS_TXT = b'''\
User-agent: test_robotparser
Disallow: /utf8/\xf0\x9f\x90\x8d
Disallow: /non-utf8/\xf0
Disallow: //[spam]/path
'''
class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
class RobotHandler(BaseHTTPRequestHandler):
def do_GET(self):
self.send_response(200)
self.end_headers()
self.wfile.write(SAMPLE_ROBOTS_TXT)
def log_message(self, format, *args):
pass
@threading_helper.reap_threads
def testRead(self):
# Test that reading a weird robots.txt doesn't fail.
addr = self.server.server_address
url = f'http://{socket_helper.HOST}:{addr[1]}'
robots_url = url + '/robots.txt'
parser = urllib.robotparser.RobotFileParser()
parser.set_url(robots_url)
parser.read()
# And it can even interpret the weird paths in some reasonable way.
agent = 'test_robotparser'
self.assertTrue(parser.can_fetch(agent, robots_url))
self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
class RobotHandler(BaseHTTPRequestHandler):
def do_GET(self):
self.send_error(403, "Forbidden access")
def log_message(self, format, *args):
pass
@threading_helper.reap_threads @threading_helper.reap_threads
def testPasswordProtectedSite(self): def testPasswordProtectedSite(self):
addr = self.server.server_address addr = self.server.server_address

View file

@ -11,6 +11,7 @@
""" """
import collections import collections
import re
import urllib.error import urllib.error
import urllib.parse import urllib.parse
import urllib.request import urllib.request
@ -20,6 +21,19 @@
RequestRate = collections.namedtuple("RequestRate", "requests seconds") RequestRate = collections.namedtuple("RequestRate", "requests seconds")
def normalize(path):
unquoted = urllib.parse.unquote(path, errors='surrogateescape')
return urllib.parse.quote(unquoted, errors='surrogateescape')
def normalize_path(path):
path, sep, query = path.partition('?')
path = normalize(path)
if sep:
query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
path += '?' + query
return path
class RobotFileParser: class RobotFileParser:
""" This class provides a set of methods to read, parse and answer """ This class provides a set of methods to read, parse and answer
questions about a single robots.txt file. questions about a single robots.txt file.
@ -55,7 +69,7 @@ def modified(self):
def set_url(self, url): def set_url(self, url):
"""Sets the URL referring to a robots.txt file.""" """Sets the URL referring to a robots.txt file."""
self.url = url self.url = url
self.host, self.path = urllib.parse.urlparse(url)[1:3] self.host, self.path = urllib.parse.urlsplit(url)[1:3]
def read(self): def read(self):
"""Reads the robots.txt URL and feeds it to the parser.""" """Reads the robots.txt URL and feeds it to the parser."""
@ -69,7 +83,7 @@ def read(self):
err.close() err.close()
else: else:
raw = f.read() raw = f.read()
self.parse(raw.decode("utf-8").splitlines()) self.parse(raw.decode("utf-8", "surrogateescape").splitlines())
def _add_entry(self, entry): def _add_entry(self, entry):
if "*" in entry.useragents: if "*" in entry.useragents:
@ -113,7 +127,7 @@ def parse(self, lines):
line = line.split(':', 1) line = line.split(':', 1)
if len(line) == 2: if len(line) == 2:
line[0] = line[0].strip().lower() line[0] = line[0].strip().lower()
line[1] = urllib.parse.unquote(line[1].strip()) line[1] = line[1].strip()
if line[0] == "user-agent": if line[0] == "user-agent":
if state == 2: if state == 2:
self._add_entry(entry) self._add_entry(entry)
@ -167,10 +181,11 @@ def can_fetch(self, useragent, url):
return False return False
# search for given user agent matches # search for given user agent matches
# the first match counts # the first match counts
parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url)) # TODO: The private API is used in order to preserve an empty query.
url = urllib.parse.urlunparse(('','',parsed_url.path, # This is temporary until the public API starts supporting this feature.
parsed_url.params,parsed_url.query, parsed_url.fragment)) parsed_url = urllib.parse._urlsplit(url, '')
url = urllib.parse.quote(url) url = urllib.parse._urlunsplit(None, None, *parsed_url[2:])
url = normalize_path(url)
if not url: if not url:
url = "/" url = "/"
for entry in self.entries: for entry in self.entries:
@ -213,7 +228,6 @@ def __str__(self):
entries = entries + [self.default_entry] entries = entries + [self.default_entry]
return '\n\n'.join(map(str, entries)) return '\n\n'.join(map(str, entries))
class RuleLine: class RuleLine:
"""A rule line is a single "Allow:" (allowance==True) or "Disallow:" """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
(allowance==False) followed by a path.""" (allowance==False) followed by a path."""
@ -221,8 +235,7 @@ def __init__(self, path, allowance):
if path == '' and not allowance: if path == '' and not allowance:
# an empty value means allow all # an empty value means allow all
allowance = True allowance = True
path = urllib.parse.urlunparse(urllib.parse.urlparse(path)) self.path = normalize_path(path)
self.path = urllib.parse.quote(path)
self.allowance = allowance self.allowance = allowance
def applies_to(self, filename): def applies_to(self, filename):
@ -268,7 +281,7 @@ def applies_to(self, useragent):
def allowance(self, filename): def allowance(self, filename):
"""Preconditions: """Preconditions:
- our agent applies to this entry - our agent applies to this entry
- filename is URL decoded""" - filename is URL encoded"""
for line in self.rulelines: for line in self.rulelines:
if line.applies_to(filename): if line.applies_to(filename):
return line.allowance return line.allowance

View file

@ -0,0 +1,3 @@
Fix parsing errors in the :mod:`urllib.robotparser` module.
Don't fail trying to parse weird paths.
Don't fail trying to decode non-UTF-8 ``robots.txt`` files.

View file

@ -0,0 +1,4 @@
Fix normalization of the ``robots.txt`` rules and URLs in the
:mod:`urllib.robotparser` module. No longer ignore trailing ``?``.
Distinguish raw special characters ``?``, ``=`` and ``&`` from the
percent-encoded ones.