HTMLParser is allowed to be more strict than sgmllib, so let's not
change their basic behavior:  When parsing something that cannot possibly
be valid in either HTML or XHTML, raise an exception.
This commit is contained in:
Fred Drake 2001-09-04 16:26:03 +00:00
parent a0ca3d611e
commit 7cf613dc77
2 changed files with 17 additions and 37 deletions

View file

@ -269,17 +269,18 @@ def parse_declaration(self, i):
return -1 return -1
# in practice, this should look like: ((name|stringlit) S*)+ '>' # in practice, this should look like: ((name|stringlit) S*)+ '>'
n = len(rawdata) n = len(rawdata)
decltype = None decltype, j = self.scan_name(j, i)
extrachars = "" if j < 0:
return j
if decltype.lower() != "doctype":
raise HTMLParseError("unknown declaration: '%s'" % decltype,
self.getpos())
while j < n: while j < n:
c = rawdata[j] c = rawdata[j]
if c == ">": if c == ">":
# end of declaration syntax # end of declaration syntax
data = rawdata[i+2:j] data = rawdata[i+2:j]
if decltype == "doctype":
self.handle_decl(data) self.handle_decl(data)
else:
self.unknown_decl(data)
return j + 1 return j + 1
if c in "\"'": if c in "\"'":
m = declstringlit.match(rawdata, j) m = declstringlit.match(rawdata, j)
@ -287,30 +288,15 @@ def parse_declaration(self, i):
return -1 # incomplete return -1 # incomplete
j = m.end() j = m.end()
elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
m = declname.match(rawdata, j) name, j = self.scan_name(j, i)
if not m:
return -1 # incomplete
j = m.end()
if decltype is None:
decltype = m.group(0).rstrip().lower()
if decltype != "doctype":
extrachars = "="
elif c == "[" and decltype == "doctype": elif c == "[" and decltype == "doctype":
j = self.parse_doctype_subset(j + 1, i) j = self.parse_doctype_subset(j + 1, i)
if j < 0:
return j
elif c in extrachars:
j = j + 1
while j < n and rawdata[j] in string.whitespace:
j = j + 1
if j == n:
# end of buffer while in declaration
return -1
else: else:
raise HTMLParseError( raise HTMLParseError(
"unexpected char in declaration: %s" % `rawdata[j]`, "unexpected char in declaration: %s" % `rawdata[j]`,
self.getpos()) self.getpos())
decltype = decltype or '' if j < 0:
return j
return -1 # incomplete return -1 # incomplete
# Internal -- scan past the internal subset in a <!DOCTYPE declaration, # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
@ -359,11 +345,9 @@ def parse_doctype_subset(self, i, declstartpos):
if (j + 1) == n: if (j + 1) == n:
# end of buffer; incomplete # end of buffer; incomplete
return -1 return -1
m = declname.match(rawdata, j + 1) s, j = self.scan_name(j + 1, declstartpos)
s = m.group() if j < 0:
if s == rawdata[j+1:]: return j
return -1
j = j + 1 + len(s.rstrip())
if rawdata[j] == ";": if rawdata[j] == ";":
j = j + 1 j = j + 1
elif c == "]": elif c == "]":
@ -383,7 +367,8 @@ def parse_doctype_subset(self, i, declstartpos):
j = j + 1 j = j + 1
else: else:
self.updatepos(declstartpos, j) self.updatepos(declstartpos, j)
raise HTMLParseError("unexpected char in internal subset", raise HTMLParseError(
"unexpected char %s in internal subset" % `c`,
self.getpos()) self.getpos())
# end of buffer reached # end of buffer reached
return -1 return -1

View file

@ -203,12 +203,7 @@ def test_attr_funky_names(self):
]) ])
def test_illegal_declarations(self): def test_illegal_declarations(self):
s = 'abc<!spacer type="block" height="25">def' self._parse_error('<!spacer type="block" height="25">')
self._run_check(s, [
("data", "abc"),
("unknown decl", 'spacer type="block" height="25"'),
("data", "def"),
])
def test_starttag_end_boundary(self): def test_starttag_end_boundary(self):
self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])]) self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])