gh-88375, gh-111788: Fix parsing errors and normalization in robotparser (GH-138502)

* Don't fail trying to parse weird patterns.
* Don't fail trying to decode non-UTF-8 "robots.txt" files.
* No longer ignore trailing "?" in patterns and URLs.
* Distinguish raw special characters "?", "=" and "&" from the
  percent-encoded ones.
* Remove tests that do nothing.
Serhiy Storchaka, 2025-09-05 18:58:42 +03:00, committed by GitHub
parent ed522ed211
commit cb7ef18d70
4 changed files with 172 additions and 31 deletions
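
A minimal sketch (not part of the commit) of the behaviour the changed tests below lock in, using only the public robotparser API; the agent name is arbitrary:

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse([
        'User-agent: *',
        'Disallow: /some/path?name=value',
        'Disallow: /another/path?',
    ])

    agent = 'ExampleBot'  # hypothetical agent name
    # A trailing "?" in a pattern is no longer ignored:
    print(parser.can_fetch(agent, '/another/path'))   # True: no query string
    print(parser.can_fetch(agent, '/another/path?'))  # False: matches the pattern
    # A raw "?" is now distinct from the percent-encoded "%3F":
    print(parser.can_fetch(agent, '/some/path%3Fname=value'))  # True
    print(parser.can_fetch(agent, '/some/path?name=value'))    # False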

@@ -16,6 +16,14 @@ class BaseRobotTest:
     bad = []
     site_maps = None
 
+    def __init_subclass__(cls):
+        super().__init_subclass__()
+        # Remove tests that do nothing.
+        if not cls.good:
+            cls.test_good_urls = None
+        if not cls.bad:
+            cls.test_bad_urls = None
+
     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
         self.parser = urllib.robotparser.RobotFileParser()
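
unittest's loader only collects test attributes that are callable, so assigning None over an inherited test method removes it from the run entirely. A standalone sketch of the same pattern (names here are illustrative, not from the commit):

    import unittest

    class Base:
        values = []

        def __init_subclass__(cls):
            super().__init_subclass__()
            # An empty data set would make the test pass vacuously;
            # replacing the method with None keeps it out of the run.
            if not cls.values:
                cls.test_values = None

        def test_values(self):
            for v in self.values:
                self.assertTrue(v)

    class WithData(Base, unittest.TestCase):
        values = [1, 2]   # test_values is collected and runs

    class WithoutData(Base, unittest.TestCase):
        values = []       # test_values is None and is not collected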
@@ -231,9 +239,16 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
 Disallow: /some/path?name=value
+Disallow: /another/path?
+Disallow: /yet/one/path?name=value&more
 """
-    good = ['/some/path']
-    bad = ['/some/path?name=value']
+    good = ['/some/path', '/some/path?',
+            '/some/path%3Fname=value', '/some/path?name%3Dvalue',
+            '/another/path', '/another/path%3F',
+            '/yet/one/path?name=value%26more']
+    bad = ['/some/path?name=value',
+           '/another/path?', '/another/path?name=value',
+           '/yet/one/path?name=value&more']
 
 
 class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
@@ -249,15 +264,79 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     bad = ['/some/path']
 
 
-class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
-    # normalize the URL first (#17403)
+class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
-Allow: /some/path?
-Disallow: /another/path?
-"""
-    good = ['/some/path?']
-    bad = ['/another/path?']
+Disallow: /a1/Z-._~  # unreserved characters
+Disallow: /a2/%5A%2D%2E%5F%7E  # percent-encoded unreserved characters
+Disallow: /u1/%F0%9F%90%8D  # percent-encoded non-ASCII Unicode character
+Disallow: /u2/%f0%9f%90%8d
+Disallow: /u3/\U0001f40d  # raw non-ASCII Unicode character
+Disallow: /v1/%F0  # percent-encoded non-ASCII octet
+Disallow: /v2/%f0
+Disallow: /v3/\udcf0  # raw non-ASCII octet
+Disallow: /p1%xy  # raw percent
+Disallow: /p2%
+Disallow: /p3%25xy  # percent-encoded percent
+Disallow: /p4%2525xy  # double percent-encoded percent
+Disallow: /john%20smith  # space
+Disallow: /john doe
+Disallow: /trailingspace%20
+Disallow: /question%3Fq=v  # not query
+Disallow: /hash%23f  # not fragment
+Disallow: /dollar%24
+Disallow: /asterisk%2A
+Disallow: /sub/dir
+Disallow: /slash%2F
+Disallow: /query/question?q=%3F
+Disallow: /query/raw/question?q=?
+Disallow: /query/eq?q%3Dv
+Disallow: /query/amp?q=v%26a
+"""
+    good = [
+        '/u1/%F0', '/u1/%f0',
+        '/u2/%F0', '/u2/%f0',
+        '/u3/%F0', '/u3/%f0',
+        '/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
+        '/question?q=v',
+        '/dollar', '/asterisk',
+        '/query/eq?q=v',
+        '/query/amp?q=v&a',
+    ]
+    bad = [
+        '/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
+        '/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
+        '/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
+        '/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
+        '/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
+        '/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
+        '/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
+        '/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
+        '/p1%xy', '/p1%25xy',
+        '/p2%', '/p2%25', '/p2%2525', '/p2%xy',
+        '/p3%xy', '/p3%25xy',
+        '/p4%2525xy',
+        '/john%20smith', '/john smith',
+        '/john%20doe', '/john doe',
+        '/trailingspace%20', '/trailingspace ',
+        '/question%3Fq=v',
+        '/hash#f', '/hash%23f',
+        '/dollar$', '/dollar%24',
+        '/asterisk*', '/asterisk%2A',
+        '/sub/dir', '/sub%2Fdir',
+        '/slash%2F', '/slash/',
+        '/query/question?q=?', '/query/question?q=%3F',
+        '/query/raw/question?q=?', '/query/raw/question?q=%3F',
+        '/query/eq?q%3Dv',
+        '/query/amp?q=v%26a',
+    ]
+    # other reserved characters
+    for c in ":/#[]@!$&'()*+,;=":
+        robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
+        bad.append(f'/raw{c}')
+        bad.append(f'/raw%{ord(c):02X}')
+        bad.append(f'/pc{c}')
+        bad.append(f'/pc%{ord(c):02X}')
 
 
 class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
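
The expectations above follow RFC 3986: percent-encoding an unreserved character ("Z", "-", ".", "_", "~") is equivalent to the raw character, while raw "?", "=" and "&" play a structural role in the query and must stay distinct from "%3F", "%3D" and "%26". A hedged sketch of one way to get that equivalence with the stdlib (an illustration of the idea, not the commit's internal code):

    from urllib.parse import quote, unquote

    def normalize(url):
        # Hypothetical helper for illustration only.
        # Split on the first raw "?" *before* unquoting, so a "%3F" in
        # the path never turns into a query separator and a trailing
        # "?" survives normalization.
        path, sep, query = url.partition('?')
        path = quote(unquote(path))
        # Re-encode each query piece separately, so raw "=" and "&"
        # keep their structural role while "%3D" and "%26" stay encoded.
        query = '&'.join('='.join(quote(unquote(p)) for p in part.split('='))
                         for part in query.split('&'))
        return path + sep + query

    # Unreserved characters fold together ...
    assert normalize('/a1/%5A%2D%2E%5F%7E') == '/a1/Z-._~'
    # ... but raw and percent-encoded query metacharacters stay apart.
    assert normalize('/another/path?') != normalize('/another/path%3F')
    assert normalize('/query/eq?q=v') != normalize('/query/eq?q%3Dv')
    assert normalize('/query/amp?q=v&a') != normalize('/query/amp?q=v%26a')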
@@ -299,26 +378,17 @@ def test_string_formatting(self):
         self.assertEqual(str(self.parser), self.expected_output)
 
 
-class RobotHandler(BaseHTTPRequestHandler):
-
-    def do_GET(self):
-        self.send_error(403, "Forbidden access")
-
-    def log_message(self, format, *args):
-        pass
-
-
 @unittest.skipUnless(
     support.has_socket_support,
     "Socket server requires working socket."
 )
-class PasswordProtectedSiteTestCase(unittest.TestCase):
+class BaseLocalNetworkTestCase:
 
     def setUp(self):
         # clear _opener global variable
         self.addCleanup(urllib.request.urlcleanup)
 
-        self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)
+        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
 
         self.t = threading.Thread(
             name='HTTPServer serving',
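
The refactor turns the one-off fixture into a reusable base: each concrete case now ships its own nested RobotHandler, and setUp resolves it through self.RobotHandler. A minimal sketch of the pattern (class names here are illustrative):

    from http.server import BaseHTTPRequestHandler, HTTPServer

    class BaseServerCase:
        RobotHandler = None  # each subclass supplies its own handler class

        def make_server(self):
            # Looked up on the instance, so subclasses change the served
            # content without overriding the fixture code itself.
            return HTTPServer(('127.0.0.1', 0), self.RobotHandler)

    class ForbiddenCase(BaseServerCase):
        class RobotHandler(BaseHTTPRequestHandler):
            def do_GET(self):
                self.send_error(403, "Forbidden access")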
@@ -335,6 +405,57 @@ def tearDown(self):
         self.t.join()
         self.server.server_close()
 
+
+SAMPLE_ROBOTS_TXT = b'''\
+User-agent: test_robotparser
+Disallow: /utf8/\xf0\x9f\x90\x8d
+Disallow: /non-utf8/\xf0
+Disallow: //[spam]/path
+'''
+
+
+class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+        def do_GET(self):
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(SAMPLE_ROBOTS_TXT)
+
+        def log_message(self, format, *args):
+            pass
+
+    @threading_helper.reap_threads
+    def testRead(self):
+        # Test that reading a weird robots.txt doesn't fail.
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + '/robots.txt'
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(robots_url)
+        parser.read()
+        # And it can even interpret the weird paths in some reasonable way.
+        agent = 'test_robotparser'
+        self.assertTrue(parser.can_fetch(agent, robots_url))
+        self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
+        self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
+
+
+class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+        def do_GET(self):
+            self.send_error(403, "Forbidden access")
+
+        def log_message(self, format, *args):
+            pass
+
     @threading_helper.reap_threads
     def testPasswordProtectedSite(self):
         addr = self.server.server_address