SF bug #1504333: sgmllib should allow angle brackets in quoted values

(modified patch by Sam Ruby; changed to use separate REs for start and end tags to reduce matching cost for end tags; extended tests; updated to avoid breaking previous changes to support IPv6 addresses in unquoted attribute values)
2025-10-31 13:41:24 +00:00 · 2006-06-29 00:51:53 +00:00 · 2006-06-29 00:51:53 +00:00 · a136210a9f
commit a136210a9f
parent 960a3f88e5
2 changed files with 25 additions and 9 deletions
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@ -29,7 +29,12 @@
 shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
 shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
 piclose = re.compile('>')
-endbracket = re.compile('[<>]')
+starttag = re.compile(r'<[a-zA-Z][-_.:a-zA-Z0-9]*\s*('
        r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
        r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]'
        r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?'
    r')*\s*/?\s*(?=[<>])')
 endtag = re.compile(r'</?[a-zA-Z][-_.:a-zA-Z0-9]*\s*/?\s*(?=[<>])')
 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
 attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
@ -249,14 +254,10 @@ def parse_starttag(self, i):
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
-        # XXX The following should skip matching quotes (' or ")
+        match = starttag.match(rawdata, i)
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
-        j = match.start(0)
+        j = match.end(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
@ -305,10 +306,10 @@ def _convert_ref(self, match):
    # Internal -- parse endtag
    def parse_endtag(self, i):
        rawdata = self.rawdata
-        match = endbracket.search(rawdata, i+1)
+        match = endtag.match(rawdata, i)
        if not match:
            return -1
-        j = match.start(0)
+        j = match.end(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
--- a/Lib/test/test_sgmllib.py
+++ b/Lib/test/test_sgmllib.py
@ -286,6 +286,21 @@ def test_convert_overrides(self):
            ('codepoint', 'convert', 42),
            ])
    def test_attr_values_quoted_markup(self):
        """Multi-line and markup in attribute values

        Each sample value is placed inside a single-quoted ``title``
        attribute of an ``<a>`` element; the expected event stream
        carries that value through verbatim, followed by the element's
        text and its end tag.
        """
        # Values containing a newline plus a tag, a bare '<', and a bare '>'.
        quoted_values = ("foo\n<br>bar", "less < than", "greater > than")
        for title in quoted_values:
            markup = "<a title='%s'>text</a>" % title
            expected_events = [
                ("starttag", "a", [("title", title)]),
                ("data", "text"),
                ("endtag", "a"),
            ]
            self.check_events(markup, expected_events)
    def test_attr_funky_names(self):
        self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),