Bug 3347: robotparser failed because it didn't convert bytes to string.

The solution is to decode the bytes to text as UTF-8.  I'm not entirely
sure if this is safe, but it looks like robots.txt is expected to be
ASCII, which is a strict subset of UTF-8, so pure-ASCII files decode unchanged.
This commit is contained in:
Jeremy Hylton 2008-07-18 20:59:44 +00:00
parent 48577d1944
commit 73fd46d24e
2 changed files with 18 additions and 5 deletions

View file

@@ -60,7 +60,8 @@ def read(self):
elif err.code >= 400:
self.allow_all = True
else:
self.parse(f.read().splitlines())
raw = f.read()
self.parse(raw.decode("utf-8").splitlines())
def _add_entry(self, entry):
if "*" in entry.useragents:
@@ -123,7 +124,10 @@ def can_fetch(self, useragent, url):
return True
# search for given user agent matches
# the first match counts
url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
url = urllib.parse.quote(
urllib.parse.urlparse(urllib.parse.unquote(url))[2])
if not url:
url = "/"
for entry in self.entries:
if entry.applies_to(useragent):
return entry.allowance(url)