gh-140875: Fix handling of unclosed charrefs before EOF in HTMLParser (GH-140904)

This commit is contained in:
Serhiy Storchaka 2025-11-19 13:55:10 +02:00 committed by GitHub
parent afa0badcc5
commit 95296a9d40
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 109 additions and 33 deletions

View file

@ -24,6 +24,7 @@
# Named entity reference: '&', then a name that starts with a letter and
# continues with letters, digits, '-' or '.' (captured as group 1), then one
# terminating character that is not alphanumeric.  Because the terminator is
# required, a reference at the very end of the input does not match.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Numeric character reference: '&#', then decimal digits or 'x'/'X' plus hex
# digits, then one terminating character that is not a hex digit.  As above,
# the mandatory terminator means this cannot match at the end of the input.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
# Start of a numeric character reference: '&#' followed by a single decimal
# digit, or by 'x'/'X' and a single hex digit.  No terminator is required, so
# this also matches a reference that is cut off by the end of the input.
incomplete_charref = re.compile('&#(?:[0-9]|[xX][0-9a-fA-F])')
# '&' followed by a numeric reference or an alphanumeric name (group 1),
# optionally followed by ';' or '='.
# NOTE(review): presumably used when unescaping attribute values -- confirm
# against the call site, which is not visible here.
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
# '<' immediately followed by an ASCII letter.
starttagopen = re.compile('<[a-zA-Z]')
@ -304,10 +305,20 @@ def goahead(self, end):
k = k - 1
i = self.updatepos(i, k)
continue
match = incomplete_charref.match(rawdata, i)
if match:
if end:
self.handle_charref(rawdata[i+2:])
i = self.updatepos(i, n)
break
# incomplete
break
elif i + 3 < n: # larger than "&#x"
# not the end of the buffer, and can't be confused
# with some other construct
self.handle_data("&#")
i = self.updatepos(i, i + 2)
else:
if ";" in rawdata[i:]: # bail by consuming &#
self.handle_data(rawdata[i:i+2])
i = self.updatepos(i, i+2)
break
elif startswith('&', i):
match = entityref.match(rawdata, i)
@ -321,15 +332,13 @@ def goahead(self, end):
continue
match = incomplete.match(rawdata, i)
if match:
# match.group() will contain at least 2 chars
if end and match.group() == rawdata[i:]:
k = match.end()
if k <= i:
k = n
i = self.updatepos(i, i + 1)
if end:
self.handle_entityref(rawdata[i+1:])
i = self.updatepos(i, n)
break
# incomplete
break
elif (i + 1) < n:
elif i + 1 < n:
# not the end of the buffer, and can't be confused
# with some other construct
self.handle_data("&")