gh-149489: Fix ElementTree serialization to HTML (GH-149490)

* The content of comments, processing instructions and elements "xmp",
  "iframe", "noembed", "noframes", and "plaintext" is no longer escaped.
* The "plaintext" element no longer has a closing tag.
* Add support for empty attributes (with value None).
This commit is contained in:
Serhiy Storchaka 2026-05-30 00:04:50 +03:00 committed by GitHub
parent f87d9605d3
commit bcd29e466f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 58 additions and 11 deletions

View file

@ -1287,7 +1287,15 @@ def check(p, expected, namespaces=None):
{'': 'http://www.w3.org/2001/XMLSchema',
'ns': 'http://www.w3.org/2001/XMLSchema'})
def test_processinginstruction(self):
def test_comment_serialization(self):
comm = ET.Comment('<spam> & ham')
# comments are not escaped
self.assertEqual(ET.tostring(comm), b'<!--<spam> & ham-->')
self.assertEqual(ET.tostring(comm, method='html'), b'<!--<spam> & ham-->')
# no comments in text serialization
self.assertEqual(ET.tostring(comm, method='text'), b'')
def test_processinginstruction_serialization(self):
# Test ProcessingInstruction directly
self.assertEqual(ET.tostring(ET.ProcessingInstruction('test', 'instruction')),
@ -1296,12 +1304,32 @@ def test_processinginstruction(self):
b'<?test instruction?>')
# Issue #2746
# processing instructions are not escaped
self.assertEqual(ET.tostring(ET.PI('test', '<testing&>')),
b'<?test <testing&>?>')
self.assertEqual(ET.tostring(ET.PI('test', '<testing&>\xe3'), 'latin-1'),
b"<?xml version='1.0' encoding='latin-1'?>\n"
b"<?test <testing&>\xe3?>")
pi = ET.PI('test', 'ham & eggs < spam')
self.assertEqual(ET.tostring(pi), b'<?test ham & eggs < spam?>')
self.assertEqual(ET.tostring(pi, method='html'), b'<?test ham & eggs < spam?>')
# no processing instructions in text serialization
self.assertEqual(ET.tostring(pi, method='text'), b'')
def test_empty_attribute_serialization(self):
# empty attrs only work in html
elem = ET.Element('tag', attrib={'attr': None})
self.assertRaises(TypeError, ET.tostring, elem)
self.assertEqual(ET.tostring(elem, method='html'), b'<tag attr></tag>')
@support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes"))
def test_html_cdata_elems_serialization(self, tag):
# content of raw text elements is not escaped in html
tag = tag.title()
elem = ET.Element(tag)
elem.text = '<spam>&ham'
self.assertEqual(ET.tostring(elem, method='html'),
('<%s><spam>&ham</%s>' % (tag, tag)).encode())
def test_html_empty_elems_serialization(self):
# issue 15970
@ -1317,6 +1345,14 @@ def test_html_empty_elems_serialization(self):
method='html')
self.assertEqual(serialized, expected)
def test_html_plaintext_serialization(self):
# content of plaintext is not escaped in html
# no end tag for plaintext
elem = ET.Element('PlainText')
elem.text = '<spam>&ham'
self.assertEqual(ET.tostring(elem, method='html'),
b'<PlainText><spam>&ham')
def test_dump_attribute_order(self):
# See BPO 34160
e = ET.Element('cirriculum', status='public', company='example')

View file

@ -917,17 +917,20 @@ def _serialize_xml(write, elem, qnames, namespaces,
if elem.tail:
write(_escape_cdata(elem.tail))
_CDATA_CONTENT_ELEMENTS = {"script", "style", "xmp", "iframe", "noembed",
"noframes", "plaintext"}
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "embed", "frame", "hr",
"img", "input", "isindex", "link", "meta", "param", "source",
"track", "wbr"}
"track", "wbr", "plaintext"}
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
tag = elem.tag
text = elem.text
if tag is Comment:
write("<!--%s-->" % _escape_cdata(text))
write("<!--%s-->" % text)
elif tag is ProcessingInstruction:
write("<?%s?>" % _escape_cdata(text))
write("<?%s?>" % text)
else:
tag = qnames[tag]
if tag is None:
@ -951,16 +954,19 @@ def _serialize_html(write, elem, qnames, namespaces, **kwargs):
for k, v in items:
if isinstance(k, QName):
k = k.text
if isinstance(v, QName):
v = qnames[v.text]
k = qnames[k]
if v is None:
write(" %s" % k) # empty attr
else:
v = _escape_attrib_html(v)
# FIXME: handle boolean attributes
write(" %s=\"%s\"" % (qnames[k], v))
if isinstance(v, QName):
v = qnames[v.text]
else:
v = _escape_attrib_html(v)
write(" %s=\"%s\"" % (k, v))
write(">")
ltag = tag.lower()
if text:
if ltag == "script" or ltag == "style":
if ltag in _CDATA_CONTENT_ELEMENTS:
write(text)
else:
write(_escape_cdata(text))

View file

@ -0,0 +1,5 @@
Fix :mod:`~xml.etree.ElementTree` serialization to HTML. The content of
comments, processing instructions and elements "xmp", "iframe", "noembed",
"noframes", and "plaintext" is no longer escaped. The "plaintext" element no
longer has a closing tag. Add support for empty attributes (with value
``None``).