mirror of
https://github.com/python/cpython.git
synced 2025-12-08 06:10:17 +00:00
[3.9] gh-137836: Support more RAWTEXT and PLAINTEXT elements in HTMLParser (GH-137837) (GH-140842) (GH-140857)
(cherry picked from commit a17c57eee5) (cherry picked from commit 0329bd11c7) Co-authored-by: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
This commit is contained in:
parent
fde6ac1290
commit
941811fc9d
4 changed files with 163 additions and 114 deletions
|
|
@@ -109,16 +109,24 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
argument.
|
||||
"""
|
||||
|
||||
CDATA_CONTENT_ELEMENTS = ("script", "style")
|
||||
# See the HTML5 specs section "13.4 Parsing HTML fragments".
|
||||
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
|
||||
# CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode
|
||||
CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
|
||||
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
|
||||
|
||||
def __init__(self, *, convert_charrefs=True):
|
||||
def __init__(self, *, convert_charrefs=True, scripting=False):
|
||||
"""Initialize and reset this instance.
|
||||
|
||||
If convert_charrefs is True (the default), all character references
|
||||
If convert_charrefs is true (the default), all character references
|
||||
are automatically converted to the corresponding Unicode characters.
|
||||
|
||||
If *scripting* is false (the default), the content of the
|
||||
``noscript`` element is parsed normally; if it's true,
|
||||
it's returned as is without being parsed.
|
||||
"""
|
||||
self.convert_charrefs = convert_charrefs
|
||||
self.scripting = scripting
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
|
|
@@ -153,7 +161,9 @@ def get_starttag_text(self):
|
|||
def set_cdata_mode(self, elem, *, escapable=False):
|
||||
self.cdata_elem = elem.lower()
|
||||
self._escapable = escapable
|
||||
if escapable and not self.convert_charrefs:
|
||||
if self.cdata_elem == 'plaintext':
|
||||
self.interesting = re.compile(r'\Z')
|
||||
elif escapable and not self.convert_charrefs:
|
||||
self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
|
||||
re.IGNORECASE|re.ASCII)
|
||||
else:
|
||||
|
|
@@ -441,8 +451,10 @@ def parse_starttag(self, i):
|
|||
self.handle_startendtag(tag, attrs)
|
||||
else:
|
||||
self.handle_starttag(tag, attrs)
|
||||
if tag in self.CDATA_CONTENT_ELEMENTS:
|
||||
self.set_cdata_mode(tag)
|
||||
if (tag in self.CDATA_CONTENT_ELEMENTS or
|
||||
(self.scripting and tag == "noscript") or
|
||||
tag == "plaintext"):
|
||||
self.set_cdata_mode(tag, escapable=False)
|
||||
elif tag in self.RCDATA_CONTENT_ELEMENTS:
|
||||
self.set_cdata_mode(tag, escapable=True)
|
||||
return endpos
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue