mirror of
				https://github.com/python/cpython.git
				synced 2025-11-03 23:21:29 +00:00 
			
		
		
		
	[3.14] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665) (#137772)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
		
							parent
							
								
									75c2d9f7c4
								
							
						
					
					
						commit
						61f7156965
					
				
					 3 changed files with 90 additions and 27 deletions
				
			
		| 
						 | 
					@ -146,6 +146,7 @@ def reset(self):
 | 
				
			||||||
        self.lasttag = '???'
 | 
					        self.lasttag = '???'
 | 
				
			||||||
        self.interesting = interesting_normal
 | 
					        self.interesting = interesting_normal
 | 
				
			||||||
        self.cdata_elem = None
 | 
					        self.cdata_elem = None
 | 
				
			||||||
 | 
					        self._support_cdata = True
 | 
				
			||||||
        self._escapable = True
 | 
					        self._escapable = True
 | 
				
			||||||
        super().reset()
 | 
					        super().reset()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -183,6 +184,19 @@ def clear_cdata_mode(self):
 | 
				
			||||||
        self.cdata_elem = None
 | 
					        self.cdata_elem = None
 | 
				
			||||||
        self._escapable = True
 | 
					        self._escapable = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _set_support_cdata(self, flag=True):
 | 
				
			||||||
 | 
					        """Enable or disable support of the CDATA sections.
 | 
				
			||||||
 | 
					        If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
 | 
				
			||||||
 | 
					        If disabled, "<![CDATA[" starts a bogus comment which ends with ">".
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        This method is not called by default. Its purpose is to be called
 | 
				
			||||||
 | 
					        in custom handle_starttag() and handle_endtag() methods, with
 | 
				
			||||||
 | 
					        a value that depends on the adjusted current node.
 | 
				
			||||||
 | 
					        See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
 | 
				
			||||||
 | 
					        for details.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        self._support_cdata = flag
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Internal -- handle data as far as reasonable.  May leave state
 | 
					    # Internal -- handle data as far as reasonable.  May leave state
 | 
				
			||||||
    # and data to be processed by a subsequent call.  If 'end' is
 | 
					    # and data to be processed by a subsequent call.  If 'end' is
 | 
				
			||||||
    # true, force handling all data as if followed by EOF marker.
 | 
					    # true, force handling all data as if followed by EOF marker.
 | 
				
			||||||
| 
						 | 
					@ -257,7 +271,7 @@ def goahead(self, end):
 | 
				
			||||||
                                j -= len(suffix)
 | 
					                                j -= len(suffix)
 | 
				
			||||||
                                break
 | 
					                                break
 | 
				
			||||||
                        self.handle_comment(rawdata[i+4:j])
 | 
					                        self.handle_comment(rawdata[i+4:j])
 | 
				
			||||||
                    elif startswith("<![CDATA[", i):
 | 
					                    elif startswith("<![CDATA[", i) and self._support_cdata:
 | 
				
			||||||
                        self.unknown_decl(rawdata[i+3:])
 | 
					                        self.unknown_decl(rawdata[i+3:])
 | 
				
			||||||
                    elif rawdata[i:i+9].lower() == '<!doctype':
 | 
					                    elif rawdata[i:i+9].lower() == '<!doctype':
 | 
				
			||||||
                        self.handle_decl(rawdata[i+2:])
 | 
					                        self.handle_decl(rawdata[i+2:])
 | 
				
			||||||
| 
						 | 
					@ -333,8 +347,12 @@ def parse_html_declaration(self, i):
 | 
				
			||||||
        if rawdata[i:i+4] == '<!--':
 | 
					        if rawdata[i:i+4] == '<!--':
 | 
				
			||||||
            # this case is actually already handled in goahead()
 | 
					            # this case is actually already handled in goahead()
 | 
				
			||||||
            return self.parse_comment(i)
 | 
					            return self.parse_comment(i)
 | 
				
			||||||
        elif rawdata[i:i+9] == '<![CDATA[':
 | 
					        elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
 | 
				
			||||||
            return self.parse_marked_section(i)
 | 
					            j = rawdata.find(']]>', i+9)
 | 
				
			||||||
 | 
					            if j < 0:
 | 
				
			||||||
 | 
					                return -1
 | 
				
			||||||
 | 
					            self.unknown_decl(rawdata[i+3: j])
 | 
				
			||||||
 | 
					            return j + 3
 | 
				
			||||||
        elif rawdata[i:i+9].lower() == '<!doctype':
 | 
					        elif rawdata[i:i+9].lower() == '<!doctype':
 | 
				
			||||||
            # find the closing >
 | 
					            # find the closing >
 | 
				
			||||||
            gtpos = rawdata.find('>', i+9)
 | 
					            gtpos = rawdata.find('>', i+9)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -10,10 +10,13 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class EventCollector(html.parser.HTMLParser):
 | 
					class EventCollector(html.parser.HTMLParser):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self, *args, **kw):
 | 
					    def __init__(self, *args, autocdata=False, **kw):
 | 
				
			||||||
 | 
					        self.autocdata = autocdata
 | 
				
			||||||
        self.events = []
 | 
					        self.events = []
 | 
				
			||||||
        self.append = self.events.append
 | 
					        self.append = self.events.append
 | 
				
			||||||
        html.parser.HTMLParser.__init__(self, *args, **kw)
 | 
					        html.parser.HTMLParser.__init__(self, *args, **kw)
 | 
				
			||||||
 | 
					        if autocdata:
 | 
				
			||||||
 | 
					            self._set_support_cdata(False)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_events(self):
 | 
					    def get_events(self):
 | 
				
			||||||
        # Normalize the list of events so that buffer artefacts don't
 | 
					        # Normalize the list of events so that buffer artefacts don't
 | 
				
			||||||
| 
						 | 
					@ -34,12 +37,16 @@ def get_events(self):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def handle_starttag(self, tag, attrs):
 | 
					    def handle_starttag(self, tag, attrs):
 | 
				
			||||||
        self.append(("starttag", tag, attrs))
 | 
					        self.append(("starttag", tag, attrs))
 | 
				
			||||||
 | 
					        if self.autocdata and tag == 'svg':
 | 
				
			||||||
 | 
					            self._set_support_cdata(True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def handle_startendtag(self, tag, attrs):
 | 
					    def handle_startendtag(self, tag, attrs):
 | 
				
			||||||
        self.append(("startendtag", tag, attrs))
 | 
					        self.append(("startendtag", tag, attrs))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def handle_endtag(self, tag):
 | 
					    def handle_endtag(self, tag):
 | 
				
			||||||
        self.append(("endtag", tag))
 | 
					        self.append(("endtag", tag))
 | 
				
			||||||
 | 
					        if self.autocdata and tag == 'svg':
 | 
				
			||||||
 | 
					            self._set_support_cdata(False)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # all other markup
 | 
					    # all other markup
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -767,10 +774,6 @@ def test_eof_in_declarations(self):
 | 
				
			||||||
            ('<!', [('comment', '')]),
 | 
					            ('<!', [('comment', '')]),
 | 
				
			||||||
            ('<!-', [('comment', '-')]),
 | 
					            ('<!-', [('comment', '-')]),
 | 
				
			||||||
            ('<![', [('comment', '[')]),
 | 
					            ('<![', [('comment', '[')]),
 | 
				
			||||||
            ('<![CDATA[', [('unknown decl', 'CDATA[')]),
 | 
					 | 
				
			||||||
            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
 | 
					 | 
				
			||||||
            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
 | 
					 | 
				
			||||||
            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
 | 
					 | 
				
			||||||
            ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
 | 
					            ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
 | 
				
			||||||
            ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
 | 
					            ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
 | 
				
			||||||
            ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
 | 
					            ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
 | 
				
			||||||
| 
						 | 
					@ -783,6 +786,18 @@ def test_eof_in_declarations(self):
 | 
				
			||||||
        for html, expected in data:
 | 
					        for html, expected in data:
 | 
				
			||||||
            self._run_check(html, expected)
 | 
					            self._run_check(html, expected)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @support.subTests('content', ['', 'x', 'x]', 'x]]'])
 | 
				
			||||||
 | 
					    def test_eof_in_cdata(self, content):
 | 
				
			||||||
 | 
					        self._run_check('<![CDATA[' + content,
 | 
				
			||||||
 | 
					                        [('unknown decl', 'CDATA[' + content)])
 | 
				
			||||||
 | 
					        self._run_check('<![CDATA[' + content,
 | 
				
			||||||
 | 
					                        [('comment', '[CDATA[' + content)],
 | 
				
			||||||
 | 
					                        collector=EventCollector(autocdata=True))
 | 
				
			||||||
 | 
					        self._run_check('<svg><text y="100"><![CDATA[' + content,
 | 
				
			||||||
 | 
					                        [('starttag', 'svg', []),
 | 
				
			||||||
 | 
					                         ('starttag', 'text', [('y', '100')]),
 | 
				
			||||||
 | 
					                         ('unknown decl', 'CDATA[' + content)])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_bogus_comments(self):
 | 
					    def test_bogus_comments(self):
 | 
				
			||||||
        html = ('<!ELEMENT br EMPTY>'
 | 
					        html = ('<!ELEMENT br EMPTY>'
 | 
				
			||||||
                '<! not really a comment >'
 | 
					                '<! not really a comment >'
 | 
				
			||||||
| 
						 | 
					@ -845,28 +860,53 @@ def test_broken_condcoms(self):
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
        self._run_check(html, expected)
 | 
					        self._run_check(html, expected)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_cdata_declarations(self):
 | 
					    @support.subTests('content', [
 | 
				
			||||||
        # More tests should be added. See also "8.2.4.42. Markup
 | 
					        'just some plain text',
 | 
				
			||||||
        # declaration open state", "8.2.4.69. CDATA section state",
 | 
					        '<!-- not a comment -->',
 | 
				
			||||||
        # and issue 32876
 | 
					        '&not-an-entity-ref;',
 | 
				
			||||||
        html = ('<![CDATA[just some plain text]]>')
 | 
					        "<not a='start tag'>",
 | 
				
			||||||
        expected = [('unknown decl', 'CDATA[just some plain text')]
 | 
					        '',
 | 
				
			||||||
        self._run_check(html, expected)
 | 
					        '[[I have many brackets]]',
 | 
				
			||||||
 | 
					        'I have a > in the middle',
 | 
				
			||||||
    def test_cdata_declarations_multiline(self):
 | 
					        'I have a ]] in the middle',
 | 
				
			||||||
        html = ('<code><![CDATA['
 | 
					        '] ]>',
 | 
				
			||||||
                '    if (a < b && a > b) {'
 | 
					        ']] >',
 | 
				
			||||||
                '        printf("[<marquee>How?</marquee>]");'
 | 
					        ('\n'
 | 
				
			||||||
                '    }'
 | 
					         '    if (a < b && a > b) {\n'
 | 
				
			||||||
                ']]></code>')
 | 
					         '        printf("[<marquee>How?</marquee>]");\n'
 | 
				
			||||||
 | 
					         '    }\n'),
 | 
				
			||||||
 | 
					    ])
 | 
				
			||||||
 | 
					    def test_cdata_section_content(self, content):
 | 
				
			||||||
 | 
					        # See "13.2.5.42 Markup declaration open state",
 | 
				
			||||||
 | 
					        # "13.2.5.69 CDATA section state", and issue bpo-32876.
 | 
				
			||||||
 | 
					        html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>'
 | 
				
			||||||
        expected = [
 | 
					        expected = [
 | 
				
			||||||
            ('starttag', 'code', []),
 | 
					            ('starttag', 'svg', []),
 | 
				
			||||||
            ('unknown decl',
 | 
					            ('starttag', 'text', [('y', '100')]),
 | 
				
			||||||
             'CDATA[    if (a < b && a > b) {        '
 | 
					            ('unknown decl', 'CDATA[' + content),
 | 
				
			||||||
             'printf("[<marquee>How?</marquee>]");    }'),
 | 
					            ('endtag', 'text'),
 | 
				
			||||||
            ('endtag', 'code')
 | 
					            ('endtag', 'svg'),
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
        self._run_check(html, expected)
 | 
					        self._run_check(html, expected)
 | 
				
			||||||
 | 
					        self._run_check(html, expected, collector=EventCollector(autocdata=True))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_cdata_section(self):
 | 
				
			||||||
 | 
					        # See "13.2.5.42 Markup declaration open state".
 | 
				
			||||||
 | 
					        html = ('<![CDATA[foo<br>bar]]>'
 | 
				
			||||||
 | 
					                '<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>'
 | 
				
			||||||
 | 
					                '<![CDATA[foo<br>bar]]>')
 | 
				
			||||||
 | 
					        expected = [
 | 
				
			||||||
 | 
					            ('comment', '[CDATA[foo<br'),
 | 
				
			||||||
 | 
					            ('data', 'bar]]>'),
 | 
				
			||||||
 | 
					            ('starttag', 'svg', []),
 | 
				
			||||||
 | 
					            ('starttag', 'text', [('y', '100')]),
 | 
				
			||||||
 | 
					            ('unknown decl', 'CDATA[foo<br>bar'),
 | 
				
			||||||
 | 
					            ('endtag', 'text'),
 | 
				
			||||||
 | 
					            ('endtag', 'svg'),
 | 
				
			||||||
 | 
					            ('comment', '[CDATA[foo<br'),
 | 
				
			||||||
 | 
					            ('data', 'bar]]>'),
 | 
				
			||||||
 | 
					        ]
 | 
				
			||||||
 | 
					        self._run_check(html, expected, collector=EventCollector(autocdata=True))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_convert_charrefs_dropped_text(self):
 | 
					    def test_convert_charrefs_dropped_text(self):
 | 
				
			||||||
        # #23144: make sure that all the events are triggered when
 | 
					        # #23144: make sure that all the events are triggered when
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -0,0 +1,5 @@
 | 
				
			||||||
 | 
					Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to
 | 
				
			||||||
 | 
					the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section.
 | 
				
			||||||
 | 
					Add private method ``_set_support_cdata()`` which can be used to specify
 | 
				
			||||||
 | 
					how to parse ``<![CDATA[`` --- as a CDATA section in foreign content
 | 
				
			||||||
 | 
					(SVG or MathML) or as a bogus comment in the HTML namespace.
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue