gh-88375, gh-111788: Fix parsing errors and normalization in robotparser (GH-138502)

* Don't fail trying to parse weird patterns. * Don't fail trying to decode non-UTF-8 "robots.txt" files. * No longer ignore trailing "?" in patterns and URLs. * Distinguish raw special characters "?", "=" and "&" from the percent-encoded ones. * Remove tests that do nothing.
2025-11-01 06:01:29 +00:00 · 2025-09-05 18:58:42 +03:00 · 2025-09-05 18:58:42 +03:00 · cb7ef18d70
commit cb7ef18d70
parent ed522ed211
4 changed files with 172 additions and 31 deletions
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@ -16,6 +16,14 @@ class BaseRobotTest:
    bad = []
    site_maps = None
    def __init_subclass__(cls):
        super().__init_subclass__()
        # Remove tests that do nothing.
        if not cls.good:
            cls.test_good_urls = None
        if not cls.bad:
            cls.test_bad_urls = None
    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
@ -231,9 +239,16 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
 User-agent: *
 Disallow: /some/path?name=value
 Disallow: /another/path?
 Disallow: /yet/one/path?name=value&more
    """
-    good = ['/some/path']
+    good = ['/some/path', '/some/path?',
-    bad = ['/some/path?name=value']
+            '/some/path%3Fname=value', '/some/path?name%3Dvalue',
            '/another/path', '/another/path%3F',
            '/yet/one/path?name=value%26more']
    bad = ['/some/path?name=value'
           '/another/path?', '/another/path?name=value',
           '/yet/one/path?name=value&more']
 class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
@ -249,15 +264,79 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    bad = ['/some/path']
-class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
+class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
 User-agent: *
-Allow: /some/path?
+Disallow: /a1/Z-._~ # unreserved characters
-Disallow: /another/path?
+Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
 Disallow: /u1/%F0%9F%90%8D # percent-encoded ASCII Unicode character
 Disallow: /u2/%f0%9f%90%8d
 Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
 Disallow: /v1/%F0 # percent-encoded non-ASCII octet
 Disallow: /v2/%f0
 Disallow: /v3/\udcf0 # raw non-ASCII octet
 Disallow: /p1%xy # raw percent
 Disallow: /p2%
 Disallow: /p3%25xy # percent-encoded percent
 Disallow: /p4%2525xy # double percent-encoded percent
 Disallow: /john%20smith # space
 Disallow: /john doe
 Disallow: /trailingspace%20
 Disallow: /question%3Fq=v # not query
 Disallow: /hash%23f # not fragment
 Disallow: /dollar%24
 Disallow: /asterisk%2A
 Disallow: /sub/dir
 Disallow: /slash%2F
 Disallow: /query/question?q=%3F
 Disallow: /query/raw/question?q=?
 Disallow: /query/eq?q%3Dv
 Disallow: /query/amp?q=v%26a
 """
-    good = ['/some/path?']
+    good = [
-    bad = ['/another/path?']
+        '/u1/%F0', '/u1/%f0',
        '/u2/%F0', '/u2/%f0',
        '/u3/%F0', '/u3/%f0',
        '/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
        '/question?q=v',
        '/dollar', '/asterisk',
        '/query/eq?q=v',
        '/query/amp?q=v&a',
    ]
    bad = [
        '/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
        '/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
        '/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
        '/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
        '/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
        '/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
        '/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
        '/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
        '/p1%xy', '/p1%25xy',
        '/p2%', '/p2%25', '/p2%2525', '/p2%xy',
        '/p3%xy', '/p3%25xy',
        '/p4%2525xy',
        '/john%20smith', '/john smith',
        '/john%20doe', '/john doe',
        '/trailingspace%20', '/trailingspace ',
        '/question%3Fq=v',
        '/hash#f', '/hash%23f',
        '/dollar$', '/dollar%24',
        '/asterisk*', '/asterisk%2A',
        '/sub/dir', '/sub%2Fdir',
        '/slash%2F', '/slash/',
        '/query/question?q=?', '/query/question?q=%3F',
        '/query/raw/question?q=?', '/query/raw/question?q=%3F',
        '/query/eq?q%3Dv',
        '/query/amp?q=v%26a',
    ]
    # other reserved characters
    for c in ":/#[]@!$&'()*+,;=":
        robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
        bad.append(f'/raw{c}')
        bad.append(f'/raw%{ord(c):02X}')
        bad.append(f'/pc{c}')
        bad.append(f'/pc%{ord(c):02X}')
 class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
@ -299,26 +378,17 @@ def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)
 class RobotHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_error(403, "Forbidden access")
    def log_message(self, format, *args):
        pass
@unittest.skipUnless(
    support.has_socket_support,
    "Socket server requires working socket."
 )
-class PasswordProtectedSiteTestCase(unittest.TestCase):
+class BaseLocalNetworkTestCase:
    def setUp(self):
        # clear _opener global variable
        self.addCleanup(urllib.request.urlcleanup)
-        self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)
+        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
        self.t = threading.Thread(
            name='HTTPServer serving',
@ -335,6 +405,57 @@ def tearDown(self):
        self.t.join()
        self.server.server_close()
 SAMPLE_ROBOTS_TXT = b'''\
 User-agent: test_robotparser
 Disallow: /utf8/\xf0\x9f\x90\x8d
 Disallow: /non-utf8/\xf0
 Disallow: //[spam]/path
 '''
 class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
    class RobotHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            self.send_response(200)
            self.end_headers()
            self.wfile.write(SAMPLE_ROBOTS_TXT)
        def log_message(self, format, *args):
            pass
    @threading_helper.reap_threads
    def testRead(self):
        # Test that reading a weird robots.txt doesn't fail.
        addr = self.server.server_address
        url = f'http://{socket_helper.HOST}:{addr[1]}'
        robots_url = url + '/robots.txt'
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(robots_url)
        parser.read()
        # And it can even interpret the weird paths in some reasonable way.
        agent = 'test_robotparser'
        self.assertTrue(parser.can_fetch(agent, robots_url))
        self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
        self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
        self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
        self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
 class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
    class RobotHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            self.send_error(403, "Forbidden access")
        def log_message(self, format, *args):
            pass
    @threading_helper.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@ -11,6 +11,7 @@
 """
 import collections
 import re
 import urllib.error
 import urllib.parse
 import urllib.request
@ -20,6 +21,19 @@
 RequestRate = collections.namedtuple("RequestRate", "requests seconds")
 def normalize(path):
    unquoted = urllib.parse.unquote(path, errors='surrogateescape')
    return urllib.parse.quote(unquoted, errors='surrogateescape')
 def normalize_path(path):
    path, sep, query = path.partition('?')
    path = normalize(path)
    if sep:
        query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
        path += '?' + query
    return path
 class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.
@ -55,7 +69,7 @@ def modified(self):
    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
-        self.host, self.path = urllib.parse.urlparse(url)[1:3]
+        self.host, self.path = urllib.parse.urlsplit(url)[1:3]
    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
@ -69,7 +83,7 @@ def read(self):
            err.close()
        else:
            raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
+            self.parse(raw.decode("utf-8", "surrogateescape").splitlines())
    def _add_entry(self, entry):
        if "*" in entry.useragents:
@ -113,7 +127,7 @@ def parse(self, lines):
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
-                line[1] = urllib.parse.unquote(line[1].strip())
+                line[1] = line[1].strip()
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
@ -167,10 +181,11 @@ def can_fetch(self, useragent, url):
            return False
        # search for given user agent matches
        # the first match counts
-        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
+        # TODO: The private API is used in order to preserve an empty query.
-        url = urllib.parse.urlunparse(('','',parsed_url.path,
+        # This is temporary until the public API starts supporting this feature.
-            parsed_url.params,parsed_url.query, parsed_url.fragment))
+        parsed_url = urllib.parse._urlsplit(url, '')
-        url = urllib.parse.quote(url)
+        url = urllib.parse._urlunsplit(None, None, *parsed_url[2:])
        url = normalize_path(url)
        if not url:
            url = "/"
        for entry in self.entries:
@ -213,7 +228,6 @@ def __str__(self):
            entries = entries + [self.default_entry]
        return '\n\n'.join(map(str, entries))
 class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
@ -221,8 +235,7 @@ def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
-        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
+        self.path = normalize_path(path)
        self.path = urllib.parse.quote(path)
        self.allowance = allowance
    def applies_to(self, filename):
@ -268,7 +281,7 @@ def applies_to(self, useragent):
    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
-        - filename is URL decoded"""
+        - filename is URL encoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
--- a/Misc/NEWS.d/next/Library/2025-09-04-15-18-11.gh-issue-111788.tuTEM5.rst
+++ b/Misc/NEWS.d/next/Library/2025-09-04-15-18-11.gh-issue-111788.tuTEM5.rst
@ -0,0 +1,3 @@
 Fix parsing errors in the :mod:`urllib.robotparser` module.
 Don't fail trying to parse weird paths.
 Don't fail trying to decode non-UTF-8 ``robots.txt`` files.
--- a/Misc/NEWS.d/next/Library/2025-09-05-15-35-59.gh-issue-88375.dC491a.rst
+++ b/Misc/NEWS.d/next/Library/2025-09-05-15-35-59.gh-issue-88375.dC491a.rst
@ -0,0 +1,4 @@
 Fix normalization of the ``robots.txt`` rules and URLs in the
 :mod:`urllib.robotparser` module. No longer ignore trailing ``?``.
 Distinguish raw special characters ``?``, ``=`` and ``&`` from the
 percent-encoded ones.