gh-88375, gh-111788: Fix parsing errors and normalization in robotparser (GH-138502)

* Don't fail trying to parse weird patterns.
* Don't fail trying to decode non-UTF-8 "robots.txt" files.
* No longer ignore trailing "?" in patterns and URLs.
* Distinguish raw special characters "?", "=" and "&" from the
  percent-encoded ones.
* Remove tests that do nothing.
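A rough sketch of the net effect (hypothetical user agent, rules and URLs; the expected results follow from the patch below):

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.parse([
        "User-agent: *",
        "Disallow: /search?",      # trailing "?" is now significant
        "Disallow: /q?a=b%26c",    # percent-encoded "&" inside a value
    ])
    rp.can_fetch("mybot", "http://example.com/search")     # True: no trailing "?"
    rp.can_fetch("mybot", "http://example.com/search?")    # False
    rp.can_fetch("mybot", "http://example.com/q?a=b%26c")  # False: "%26" matches "%26"
    rp.can_fetch("mybot", "http://example.com/q?a=b&c")    # True: raw "&" is a separator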
Author: Serhiy Storchaka, 2025-09-05 18:58:42 +03:00 (committed by GitHub)
parent ed522ed211
commit cb7ef18d70
4 changed files with 172 additions and 31 deletions

Lib/urllib/robotparser.py

@@ -11,6 +11,7 @@
 """
 import collections
+import re
 import urllib.error
 import urllib.parse
 import urllib.request
@@ -20,6 +21,19 @@
 RequestRate = collections.namedtuple("RequestRate", "requests seconds")
 
 
+def normalize(path):
+    unquoted = urllib.parse.unquote(path, errors='surrogateescape')
+    return urllib.parse.quote(unquoted, errors='surrogateescape')
+
+def normalize_path(path):
+    path, sep, query = path.partition('?')
+    path = normalize(path)
+    if sep:
+        query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
+        path += '?' + query
+    return path
+
+
 class RobotFileParser:
     """ This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file.
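In isolation, the new helpers canonicalize percent-encoding per component while leaving the raw "?", "=" and "&" delimiters alone; a quick sketch (expected values in comments):

    normalize_path('/caf\u00e9')   # -> '/caf%C3%A9' (unencoded text is quoted)
    normalize_path('/caf%c3%a9')   # -> '/caf%C3%A9' (same canonical form)
    normalize_path('/p?')          # -> '/p?'        (trailing "?" survives)
    normalize_path('/p?a=b%26c')   # -> '/p?a=b%26c' ("%26" stays encoded)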
@@ -55,7 +69,7 @@ def modified(self):
     def set_url(self, url):
         """Sets the URL referring to a robots.txt file."""
         self.url = url
-        self.host, self.path = urllib.parse.urlparse(url)[1:3]
+        self.host, self.path = urllib.parse.urlsplit(url)[1:3]
 
     def read(self):
         """Reads the robots.txt URL and feeds it to the parser."""
@@ -69,7 +83,7 @@ def read(self):
             err.close()
         else:
             raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
+            self.parse(raw.decode("utf-8", "surrogateescape").splitlines())
 
     def _add_entry(self, entry):
         if "*" in entry.useragents:
@@ -113,7 +127,7 @@ def parse(self, lines):
             line = line.split(':', 1)
             if len(line) == 2:
                 line[0] = line[0].strip().lower()
-                line[1] = urllib.parse.unquote(line[1].strip())
+                line[1] = line[1].strip()
                 if line[0] == "user-agent":
                     if state == 2:
                         self._add_entry(entry)
@@ -167,10 +181,11 @@ def can_fetch(self, useragent, url):
             return False
         # search for given user agent matches
         # the first match counts
-        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
-        url = urllib.parse.urlunparse(('','',parsed_url.path,
-            parsed_url.params,parsed_url.query, parsed_url.fragment))
-        url = urllib.parse.quote(url)
+        # TODO: The private API is used in order to preserve an empty query.
+        # This is temporary until the public API starts supporting this feature.
+        parsed_url = urllib.parse._urlsplit(url, '')
+        url = urllib.parse._urlunsplit(None, None, *parsed_url[2:])
+        url = normalize_path(url)
         if not url:
             url = "/"
         for entry in self.entries:
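For context on the TODO: the public split/unsplit round trip cannot preserve an empty-but-present query, which is why the private helpers are needed here:

    import urllib.parse

    parts = urllib.parse.urlsplit("http://example.com/search?")
    parts.query                      # '' -- same as when no "?" is present at all
    urllib.parse.urlunsplit(parts)   # 'http://example.com/search' -- the "?" is lost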
@@ -213,7 +228,6 @@ def __str__(self):
             entries = entries + [self.default_entry]
         return '\n\n'.join(map(str, entries))
 
-
 class RuleLine:
     """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
     (allowance==False) followed by a path."""
@@ -221,8 +235,7 @@ def __init__(self, path, allowance):
         if path == '' and not allowance:
             # an empty value means allow all
             allowance = True
-        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
-        self.path = urllib.parse.quote(path)
+        self.path = normalize_path(path)
         self.allowance = allowance
 
     def applies_to(self, filename):
@@ -268,7 +281,7 @@ def applies_to(self, useragent):
     def allowance(self, filename):
         """Preconditions:
         - our agent applies to this entry
-        - filename is URL decoded"""
+        - filename is URL encoded"""
         for line in self.rulelines:
             if line.applies_to(filename):
                 return line.allowance