gh-88375, gh-111788: Fix parsing errors and normalization in robotparser (GH-138502)

* Don't fail trying to parse weird patterns.
* Don't fail trying to decode non-UTF-8 "robots.txt" files.
* No longer ignore trailing "?" in patterns and URLs.
* Distinguish raw special characters "?", "=" and "&" from the
  percent-encoded ones.
* Remove tests that do nothing.
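A rough sketch of the net effect (hypothetical user agent, rules and URLs; the expected results follow from the patch below):

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.parse([
        "User-agent: *",
        "Disallow: /search?",      # trailing "?" is now significant
        "Disallow: /q?a=b%26c",    # percent-encoded "&" inside a value
    ])
    rp.can_fetch("mybot", "http://example.com/search")     # True: no trailing "?"
    rp.can_fetch("mybot", "http://example.com/search?")    # False
    rp.can_fetch("mybot", "http://example.com/q?a=b%26c")  # False: "%26" matches "%26"
    rp.can_fetch("mybot", "http://example.com/q?a=b&c")    # True: raw "&" is a separator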
Author: Serhiy Storchaka, 2025-09-05 18:58:42 +03:00 (committed by GitHub)
parent ed522ed211
commit cb7ef18d70
4 changed files with 172 additions and 31 deletions

Lib/urllib/robotparser.py

@@ -11,6 +11,7 @@
 """
 import collections
+import re
 import urllib.error
 import urllib.parse
 import urllib.request
@@ -20,6 +21,19 @@
 RequestRate = collections.namedtuple("RequestRate", "requests seconds")
 
 
+def normalize(path):
+    unquoted = urllib.parse.unquote(path, errors='surrogateescape')
+    return urllib.parse.quote(unquoted, errors='surrogateescape')
+
+def normalize_path(path):
+    path, sep, query = path.partition('?')
+    path = normalize(path)
+    if sep:
+        query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
+        path += '?' + query
+    return path
+
+
 class RobotFileParser:
     """ This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file.
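In isolation, the new helpers canonicalize percent-encoding per component while leaving the raw "?", "=" and "&" delimiters alone; a quick sketch (expected values in comments):

    normalize_path('/caf\u00e9')   # -> '/caf%C3%A9' (unencoded text is quoted)
    normalize_path('/caf%c3%a9')   # -> '/caf%C3%A9' (same canonical form)
    normalize_path('/p?')          # -> '/p?'        (trailing "?" survives)
    normalize_path('/p?a=b%26c')   # -> '/p?a=b%26c' ("%26" stays encoded)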
@@ -55,7 +69,7 @@ def modified(self):
     def set_url(self, url):
         """Sets the URL referring to a robots.txt file."""
         self.url = url
-        self.host, self.path = urllib.parse.urlparse(url)[1:3]
+        self.host, self.path = urllib.parse.urlsplit(url)[1:3]
 
     def read(self):
         """Reads the robots.txt URL and feeds it to the parser."""
@@ -69,7 +83,7 @@ def read(self):
             err.close()
         else:
             raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
+            self.parse(raw.decode("utf-8", "surrogateescape").splitlines())
 
     def _add_entry(self, entry):
         if "*" in entry.useragents:
@@ -113,7 +127,7 @@ def parse(self, lines):
             line = line.split(':', 1)
             if len(line) == 2:
                 line[0] = line[0].strip().lower()
-                line[1] = urllib.parse.unquote(line[1].strip())
+                line[1] = line[1].strip()
                 if line[0] == "user-agent":
                     if state == 2:
                         self._add_entry(entry)
@@ -167,10 +181,11 @@ def can_fetch(self, useragent, url):
             return False
         # search for given user agent matches
         # the first match counts
-        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
-        url = urllib.parse.urlunparse(('','',parsed_url.path,
-            parsed_url.params,parsed_url.query, parsed_url.fragment))
-        url = urllib.parse.quote(url)
+        # TODO: The private API is used in order to preserve an empty query.
+        # This is temporary until the public API starts supporting this feature.
+        parsed_url = urllib.parse._urlsplit(url, '')
+        url = urllib.parse._urlunsplit(None, None, *parsed_url[2:])
+        url = normalize_path(url)
         if not url:
             url = "/"
         for entry in self.entries:
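For context on the TODO: the public split/unsplit round trip cannot preserve an empty-but-present query, which is why the private helpers are needed here:

    import urllib.parse

    parts = urllib.parse.urlsplit("http://example.com/search?")
    parts.query                      # '' -- same as when no "?" is present at all
    urllib.parse.urlunsplit(parts)   # 'http://example.com/search' -- the "?" is lost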
@@ -213,7 +228,6 @@ def __str__(self):
             entries = entries + [self.default_entry]
         return '\n\n'.join(map(str, entries))
 
-
 class RuleLine:
     """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
     (allowance==False) followed by a path."""
@@ -221,8 +235,7 @@ def __init__(self, path, allowance):
         if path == '' and not allowance:
             # an empty value means allow all
             allowance = True
-        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
-        self.path = urllib.parse.quote(path)
+        self.path = normalize_path(path)
         self.allowance = allowance
 
     def applies_to(self, filename):
@@ -268,7 +281,7 @@ def applies_to(self, useragent):
     def allowance(self, filename):
         """Preconditions:
         - our agent applies to this entry
-        - filename is URL decoded"""
+        - filename is URL encoded"""
         for line in self.rulelines:
             if line.applies_to(filename):
                 return line.allowance