mirror of
https://github.com/python/cpython.git
synced 2025-10-20 16:33:53 +00:00
gh-135661: Fix parsing start and end tags in HTMLParser according to the HTML5 standard (GH-135930)
* Whitespace is no longer accepted between `</` and the tag name. E.g. `</ script>` does not end the script section.
* Vertical tabulation (`\v`) and non-ASCII whitespace characters are no longer recognized as whitespace. The only whitespace characters are `\t\n\r\f `.
* The null character (U+0000) no longer ends the tag name.
* Attributes and slashes after the tag name in end tags are now ignored, instead of terminating after the first `>` in a quoted attribute value. E.g. `</script/foo=">"/>`.
* Multiple slashes and whitespace characters between the last attribute and the closing `>` are now ignored in both start and end tags. E.g. `<a foo=bar/ //>`.
* Multiple `=` between an attribute name and its value are no longer collapsed. E.g. `<a foo==bar>` produces attribute "foo" with value "=bar".
* Whitespace between the `=` separator and the attribute name or value is no longer ignored. E.g. `<a foo =bar>` produces two attributes "foo" and "=bar", both with value None; `<a foo= bar>` produces two attributes: "foo" with value "" and "bar" with value None.
* Fix Sphinx errors.
* Apply suggestions from code review

  Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>

* Address review comments.
* Move to Security.

---------

Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
This commit is contained in:
parent
938a5d7e62
commit
0243f97cba
3 changed files with 192 additions and 127 deletions
|
@ -31,15 +31,43 @@
|
|||
piclose = re.compile('>')
|
||||
commentclose = re.compile(r'--\s*>')
|
||||
# Note:
|
||||
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
|
||||
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
|
||||
# 1) if you change tagfind/attrfind remember to update locatetagend too;
|
||||
# 2) if you change tagfind/attrfind and/or locatetagend the parser will
|
||||
# explode, so don't do it.
|
||||
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
|
||||
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
|
||||
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
|
||||
attrfind_tolerant = re.compile(
|
||||
r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
|
||||
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
|
||||
# see the HTML5 specs section "13.2.5.6 Tag open state",
|
||||
# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
|
||||
# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
|
||||
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
|
||||
# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
|
||||
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')
|
||||
attrfind_tolerant = re.compile(r"""
|
||||
(
|
||||
(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
|
||||
)
|
||||
(= # value indicator
|
||||
('[^']*' # LITA-enclosed value
|
||||
|"[^"]*" # LIT-enclosed value
|
||||
|(?!['"])[^>\t\n\r\f ]* # bare value
|
||||
)
|
||||
)?
|
||||
(?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
|
||||
""", re.VERBOSE)
|
||||
locatetagend = re.compile(r"""
|
||||
[a-zA-Z][^\t\n\r\f />]* # tag name
|
||||
[\t\n\r\f /]* # optional whitespace before attribute name
|
||||
(?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
|
||||
(?:= # value indicator
|
||||
(?:'[^']*' # LITA-enclosed value
|
||||
|"[^"]*" # LIT-enclosed value
|
||||
|(?!['"])[^>\t\n\r\f ]* # bare value
|
||||
)
|
||||
)?
|
||||
[\t\n\r\f /]* # possibly followed by a space
|
||||
)*
|
||||
>?
|
||||
""", re.VERBOSE)
|
||||
# The following variables are not used, but are temporarily left for
|
||||
# backward compatibility.
|
||||
locatestarttagend_tolerant = re.compile(r"""
|
||||
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
|
||||
(?:[\s/]* # optional whitespace before attribute name
|
||||
|
@ -56,8 +84,6 @@
|
|||
\s* # trailing whitespace
|
||||
""", re.VERBOSE)
|
||||
endendtag = re.compile('>')
|
||||
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
|
||||
# </ and the tag name, so maybe this should be fixed
|
||||
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
|
||||
|
||||
# Character reference processing logic specific to attribute values
|
||||
|
@ -141,7 +167,8 @@ def get_starttag_text(self):
|
|||
|
||||
def set_cdata_mode(self, elem):
|
||||
self.cdata_elem = elem.lower()
|
||||
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
|
||||
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
|
||||
re.IGNORECASE|re.ASCII)
|
||||
|
||||
def clear_cdata_mode(self):
|
||||
self.interesting = interesting_normal
|
||||
|
@ -166,7 +193,7 @@ def goahead(self, end):
|
|||
# & near the end and see if it's followed by a space or ;.
|
||||
amppos = rawdata.rfind('&', max(i, n-34))
|
||||
if (amppos >= 0 and
|
||||
not re.compile(r'[\s;]').search(rawdata, amppos)):
|
||||
not re.compile(r'[\t\n\r\f ;]').search(rawdata, amppos)):
|
||||
break # wait till we get all the text
|
||||
j = n
|
||||
else:
|
||||
|
@ -310,7 +337,7 @@ def parse_html_declaration(self, i):
|
|||
return self.parse_bogus_comment(i)
|
||||
|
||||
# Internal -- parse bogus comment, return length or -1 if not terminated
|
||||
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
|
||||
# see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
|
||||
def parse_bogus_comment(self, i, report=1):
|
||||
rawdata = self.rawdata
|
||||
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
|
||||
|
@ -336,6 +363,8 @@ def parse_pi(self, i):
|
|||
|
||||
# Internal -- handle starttag, return end or -1 if not terminated
|
||||
def parse_starttag(self, i):
|
||||
# See the HTML5 specs section "13.2.5.8 Tag name state"
|
||||
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
|
||||
self.__starttag_text = None
|
||||
endpos = self.check_for_whole_start_tag(i)
|
||||
if endpos < 0:
|
||||
|
@ -381,76 +410,42 @@ def parse_starttag(self, i):
|
|||
# or -1 if incomplete.
|
||||
def check_for_whole_start_tag(self, i):
|
||||
rawdata = self.rawdata
|
||||
m = locatestarttagend_tolerant.match(rawdata, i)
|
||||
if m:
|
||||
j = m.end()
|
||||
next = rawdata[j:j+1]
|
||||
if next == ">":
|
||||
return j + 1
|
||||
if next == "/":
|
||||
if rawdata.startswith("/>", j):
|
||||
return j + 2
|
||||
if rawdata.startswith("/", j):
|
||||
# buffer boundary
|
||||
return -1
|
||||
# else bogus input
|
||||
if j > i:
|
||||
return j
|
||||
else:
|
||||
return i + 1
|
||||
if next == "":
|
||||
# end of input
|
||||
return -1
|
||||
if next in ("abcdefghijklmnopqrstuvwxyz=/"
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
|
||||
# end of input in or before attribute value, or we have the
|
||||
# '/' from a '/>' ending
|
||||
return -1
|
||||
if j > i:
|
||||
return j
|
||||
else:
|
||||
return i + 1
|
||||
raise AssertionError("we should not get here!")
|
||||
match = locatetagend.match(rawdata, i+1)
|
||||
assert match
|
||||
j = match.end()
|
||||
if rawdata[j-1] != ">":
|
||||
return -1
|
||||
return j
|
||||
|
||||
# Internal -- parse endtag, return end or -1 if incomplete
|
||||
def parse_endtag(self, i):
|
||||
# See the HTML5 specs section "13.2.5.7 End tag open state"
|
||||
# https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
|
||||
rawdata = self.rawdata
|
||||
assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
|
||||
match = endendtag.search(rawdata, i+1) # >
|
||||
if not match:
|
||||
if rawdata.find('>', i+2) < 0: # fast check
|
||||
return -1
|
||||
gtpos = match.end()
|
||||
match = endtagfind.match(rawdata, i) # </ + tag + >
|
||||
if not match:
|
||||
if self.cdata_elem is not None:
|
||||
self.handle_data(rawdata[i:gtpos])
|
||||
return gtpos
|
||||
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
|
||||
namematch = tagfind_tolerant.match(rawdata, i+2)
|
||||
if not namematch:
|
||||
# w3.org/TR/html5/tokenization.html#end-tag-open-state
|
||||
if rawdata[i:i+3] == '</>':
|
||||
return i+3
|
||||
else:
|
||||
return self.parse_bogus_comment(i)
|
||||
tagname = namematch.group(1).lower()
|
||||
# consume and ignore other stuff between the name and the >
|
||||
# Note: this is not 100% correct, since we might have things like
|
||||
# </tag attr=">">, but looking for > after the name should cover
|
||||
# most of the cases and is much simpler
|
||||
gtpos = rawdata.find('>', namematch.end())
|
||||
self.handle_endtag(tagname)
|
||||
return gtpos+1
|
||||
if not endtagopen.match(rawdata, i): # </ + letter
|
||||
if rawdata[i+2:i+3] == '>': # </> is ignored
|
||||
# "missing-end-tag-name" parser error
|
||||
return i+3
|
||||
else:
|
||||
return self.parse_bogus_comment(i)
|
||||
|
||||
elem = match.group(1).lower() # script or style
|
||||
if self.cdata_elem is not None:
|
||||
if elem != self.cdata_elem:
|
||||
self.handle_data(rawdata[i:gtpos])
|
||||
return gtpos
|
||||
match = locatetagend.match(rawdata, i+2)
|
||||
assert match
|
||||
j = match.end()
|
||||
if rawdata[j-1] != ">":
|
||||
return -1
|
||||
|
||||
self.handle_endtag(elem)
|
||||
# find the name: "13.2.5.8 Tag name state"
|
||||
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
|
||||
match = tagfind_tolerant.match(rawdata, i+2)
|
||||
assert match
|
||||
tag = match.group(1).lower()
|
||||
self.handle_endtag(tag)
|
||||
self.clear_cdata_mode()
|
||||
return gtpos
|
||||
return j
|
||||
|
||||
# Overridable -- finish processing of start+end tag: <tag.../>
|
||||
def handle_startendtag(self, tag, attrs):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue