mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 13:41:24 +00:00 
			
		
		
		
	SF bug #1504333: sgmllib should allow angle brackets in quoted values
(modified patch by Sam Ruby; changed to use separate REs for start and end tags to reduce matching cost for end tags; extended tests; updated to avoid breaking previous changes to support IPv6 addresses in unquoted attribute values)
This commit is contained in:
		
							parent
							
								
									960a3f88e5
								
							
						
					
					
						commit
						a136210a9f
					
				
					 2 changed files with 25 additions and 9 deletions
				
			
		|  | @ -29,7 +29,12 @@ | ||||||
| shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') | shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') | ||||||
| shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') | shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') | ||||||
| piclose = re.compile('>') | piclose = re.compile('>') | ||||||
| endbracket = re.compile('[<>]') | starttag = re.compile(r'<[a-zA-Z][-_.:a-zA-Z0-9]*\s*(' | ||||||
|  |         r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' | ||||||
|  |         r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]' | ||||||
|  |         r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?' | ||||||
|  |     r')*\s*/?\s*(?=[<>])') | ||||||
|  | endtag = re.compile(r'</?[a-zA-Z][-_.:a-zA-Z0-9]*\s*/?\s*(?=[<>])') | ||||||
| tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') | tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') | ||||||
| attrfind = re.compile( | attrfind = re.compile( | ||||||
|     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' |     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' | ||||||
|  | @ -249,14 +254,10 @@ def parse_starttag(self, i): | ||||||
|             self.finish_shorttag(tag, data) |             self.finish_shorttag(tag, data) | ||||||
|             self.__starttag_text = rawdata[start_pos:match.end(1) + 1] |             self.__starttag_text = rawdata[start_pos:match.end(1) + 1] | ||||||
|             return k |             return k | ||||||
|         # XXX The following should skip matching quotes (' or ") |         match = starttag.match(rawdata, i) | ||||||
|         # As a shortcut way to exit, this isn't so bad, but shouldn't |  | ||||||
|         # be used to locate the actual end of the start tag since the |  | ||||||
|         # < or > characters may be embedded in an attribute value. |  | ||||||
|         match = endbracket.search(rawdata, i+1) |  | ||||||
|         if not match: |         if not match: | ||||||
|             return -1 |             return -1 | ||||||
|         j = match.start(0) |         j = match.end(0) | ||||||
|         # Now parse the data between i+1 and j into a tag and attrs |         # Now parse the data between i+1 and j into a tag and attrs | ||||||
|         attrs = [] |         attrs = [] | ||||||
|         if rawdata[i:i+2] == '<>': |         if rawdata[i:i+2] == '<>': | ||||||
|  | @ -305,10 +306,10 @@ def _convert_ref(self, match): | ||||||
|     # Internal -- parse endtag |     # Internal -- parse endtag | ||||||
|     def parse_endtag(self, i): |     def parse_endtag(self, i): | ||||||
|         rawdata = self.rawdata |         rawdata = self.rawdata | ||||||
|         match = endbracket.search(rawdata, i+1) |         match = endtag.match(rawdata, i) | ||||||
|         if not match: |         if not match: | ||||||
|             return -1 |             return -1 | ||||||
|         j = match.start(0) |         j = match.end(0) | ||||||
|         tag = rawdata[i+2:j].strip().lower() |         tag = rawdata[i+2:j].strip().lower() | ||||||
|         if rawdata[j] == '>': |         if rawdata[j] == '>': | ||||||
|             j = j+1 |             j = j+1 | ||||||
|  |  | ||||||
|  | @ -286,6 +286,21 @@ def test_convert_overrides(self): | ||||||
|             ('codepoint', 'convert', 42), |             ('codepoint', 'convert', 42), | ||||||
|             ]) |             ]) | ||||||
| 
 | 
 | ||||||
|  |     def test_attr_values_quoted_markup(self): | ||||||
|  |         """Multi-line and markup in attribute values""" | ||||||
|  |         self.check_events("""<a title='foo\n<br>bar'>text</a>""", | ||||||
|  |             [("starttag", "a", [("title", "foo\n<br>bar")]), | ||||||
|  |              ("data", "text"), | ||||||
|  |              ("endtag", "a")]) | ||||||
|  |         self.check_events("""<a title='less < than'>text</a>""", | ||||||
|  |             [("starttag", "a", [("title", "less < than")]), | ||||||
|  |              ("data", "text"), | ||||||
|  |              ("endtag", "a")]) | ||||||
|  |         self.check_events("""<a title='greater > than'>text</a>""", | ||||||
|  |             [("starttag", "a", [("title", "greater > than")]), | ||||||
|  |              ("data", "text"), | ||||||
|  |              ("endtag", "a")]) | ||||||
|  | 
 | ||||||
|     def test_attr_funky_names(self): |     def test_attr_funky_names(self): | ||||||
|         self.check_events("""<a a.b='v' c:d=v e-f=v>""", [ |         self.check_events("""<a a.b='v' c:d=v e-f=v>""", [ | ||||||
|             ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), |             ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Fred Drake
						Fred Drake