| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  | """A parser for HTML and XHTML.""" | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | # This file is based on sgmllib.py, but the API is slightly different. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # XXX There should be a way to distinguish between PCDATA (parsed | 
					
						
							|  |  |  | # character data -- the normal case), RCDATA (replaceable character | 
					
						
							|  |  |  | # data -- only char and entity references and end tags are special) | 
					
						
							|  |  |  | # and CDATA (character data -- only end tags are special). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import re | 
					
						
							| 
									
										
										
										
											2012-06-23 15:27:51 +02:00
										 |  |  | import warnings | 
					
						
							| 
									
										
										
										
											2013-11-19 20:28:45 +02:00
										 |  |  | import _markupbase | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from html import unescape | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-05-01 16:09:34 +03:00
										 |  |  | __all__ = ['HTMLParser'] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | # Regular expressions used for parsing | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | interesting_normal = re.compile('[&<]') | 
					
						
							| 
									
										
										
										
											2001-09-04 15:10:16 +00:00
										 |  |  | incomplete = re.compile('&[a-zA-Z#]') | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  | charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | starttagopen = re.compile('<[a-zA-Z]') | 
					
						
							|  |  |  | piclose = re.compile('>') | 
					
						
							|  |  |  | commentclose = re.compile(r'--\s*>') | 
					
						
							| 
									
										
										
										
											2012-02-21 09:25:00 +02:00
										 |  |  | # Note: | 
					
						
							| 
									
										
										
										
											2014-08-02 14:10:30 +03:00
										 |  |  | #  1) if you change tagfind/attrfind remember to update locatestarttagend too; | 
					
						
							|  |  |  | #  2) if you change tagfind/attrfind and/or locatestarttagend the parser will | 
					
						
							| 
									
										
										
										
											2012-02-21 09:25:00 +02:00
										 |  |  | #     explode, so don't do it. | 
					
						
							| 
									
										
										
										
											2013-11-07 18:33:24 +02:00
										 |  |  | # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state | 
					
						
							|  |  |  | # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state | 
					
						
							|  |  |  | tagfind_tolerant = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*') | 
					
						
							| 
									
										
										
										
											2010-12-03 04:06:39 +00:00
										 |  |  | attrfind_tolerant = re.compile( | 
					
						
							| 
									
										
										
										
											2012-04-18 19:18:22 -06:00
										 |  |  |     r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' | 
					
						
							| 
									
										
										
										
											2012-02-21 09:25:00 +02:00
										 |  |  |     r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') | 
					
						
							| 
									
										
										
										
											2010-12-03 04:06:39 +00:00
										 |  |  | locatestarttagend_tolerant = re.compile(r"""
 | 
					
						
							| 
									
										
										
										
											2013-11-07 18:33:24 +02:00
										 |  |  |   <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name | 
					
						
							| 
									
										
										
										
											2012-02-21 09:25:00 +02:00
										 |  |  |   (?:[\s/]*                          # optional whitespace before attribute name | 
					
						
							|  |  |  |     (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name | 
					
						
							| 
									
										
										
										
											2011-11-14 18:53:33 +02:00
										 |  |  |       (?:\s*=+\s*                    # value indicator | 
					
						
							| 
									
										
										
										
											2010-12-03 04:06:39 +00:00
										 |  |  |         (?:'[^']*'                   # LITA-enclosed value | 
					
						
							| 
									
										
										
										
											2011-11-14 18:53:33 +02:00
										 |  |  |           |"[^"]*"                   # LIT-enclosed value | 
					
						
							|  |  |  |           |(?!['"])[^>\s]*           # bare value | 
					
						
							| 
									
										
										
										
											2010-12-03 04:06:39 +00:00
										 |  |  |          ) | 
					
						
							|  |  |  |          (?:\s*,)*                   # possibly followed by a comma | 
					
						
							| 
									
										
										
										
											2012-02-21 09:25:00 +02:00
										 |  |  |        )?(?:\s|/(?!>))* | 
					
						
							| 
									
										
										
										
											2011-11-14 18:53:33 +02:00
										 |  |  |      )* | 
					
						
							|  |  |  |    )? | 
					
						
							| 
									
										
										
										
											2010-12-03 04:06:39 +00:00
										 |  |  |   \s*                                # trailing whitespace | 
					
						
							|  |  |  | """, re.VERBOSE)
 | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | endendtag = re.compile('>') | 
					
						
							| 
									
										
										
										
											2011-11-01 14:12:22 +02:00
										 |  |  | # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between | 
					
						
							|  |  |  | # </ and the tag name, so maybe this should be fixed | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-02 17:08:24 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2007-12-07 11:10:11 +00:00
										 |  |  | class HTMLParser(_markupbase.ParserBase): | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  |     """Find tags and other markup and call handler functions.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Usage: | 
					
						
							|  |  |  |         p = HTMLParser() | 
					
						
							|  |  |  |         p.feed(data) | 
					
						
							|  |  |  |         ... | 
					
						
							|  |  |  |         p.close() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Start tags are handled by calling self.handle_starttag() or | 
					
						
							|  |  |  |     self.handle_startendtag(); end tags by self.handle_endtag().  The | 
					
						
							|  |  |  |     data between tags is passed from the parser to the derived class | 
					
						
							|  |  |  |     by calling self.handle_data() with the data as argument (the data | 
					
						
							| 
									
										
										
										
											2013-11-23 19:52:05 +02:00
										 |  |  |     may be split up in arbitrary chunks).  If convert_charrefs is | 
					
						
							|  |  |  |     True the character references are converted automatically to the | 
					
						
							|  |  |  |     corresponding Unicode character (and self.handle_data() is no | 
					
						
							|  |  |  |     longer split in chunks), otherwise they are passed by calling | 
					
						
							|  |  |  |     self.handle_entityref() or self.handle_charref() with the string | 
					
						
							|  |  |  |     containing respectively the named or numeric reference as the | 
					
						
							|  |  |  |     argument. | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     CDATA_CONTENT_ELEMENTS = ("script", "style") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-08-02 18:36:12 +03:00
										 |  |  |     def __init__(self, *, convert_charrefs=True): | 
					
						
							| 
									
										
										
										
											2010-12-03 04:06:39 +00:00
										 |  |  |         """Initialize and reset this instance.
 | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-08-02 18:36:12 +03:00
										 |  |  |         If convert_charrefs is True (the default), all character references | 
					
						
							| 
									
										
										
										
											2013-11-23 19:52:05 +02:00
										 |  |  |         are automatically converted to the corresponding Unicode characters. | 
					
						
							| 
									
										
										
										
											2010-12-03 04:06:39 +00:00
										 |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2013-11-23 19:52:05 +02:00
										 |  |  |         self.convert_charrefs = convert_charrefs | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         self.reset() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def reset(self): | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  |         """Reset this instance.  Loses all unprocessed data.""" | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         self.rawdata = '' | 
					
						
							|  |  |  |         self.lasttag = '???' | 
					
						
							|  |  |  |         self.interesting = interesting_normal | 
					
						
							| 
									
										
										
										
											2011-11-01 14:12:22 +02:00
										 |  |  |         self.cdata_elem = None | 
					
						
							| 
									
										
										
										
											2007-12-07 11:10:11 +00:00
										 |  |  |         _markupbase.ParserBase.reset(self) | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def feed(self, data): | 
					
						
							| 
									
										
										
										
											2011-05-04 15:55:47 +02:00
										 |  |  |         r"""Feed data to the parser.
 | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         Call this as often as you want, with as little or as much text | 
					
						
							|  |  |  |         as you want (may include '\n'). | 
					
						
							|  |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         self.rawdata = self.rawdata + data | 
					
						
							|  |  |  |         self.goahead(0) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def close(self): | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  |         """Handle any buffered data.""" | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         self.goahead(1) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     __starttag_text = None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def get_starttag_text(self): | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  |         """Return full source of start tag: '<...>'.""" | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         return self.__starttag_text | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-11-01 14:12:22 +02:00
										 |  |  |     def set_cdata_mode(self, elem): | 
					
						
							|  |  |  |         self.cdata_elem = elem.lower() | 
					
						
							| 
									
										
										
										
											2011-11-18 18:01:49 +02:00
										 |  |  |         self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def clear_cdata_mode(self): | 
					
						
							|  |  |  |         self.interesting = interesting_normal | 
					
						
							| 
									
										
										
										
											2011-11-01 14:12:22 +02:00
										 |  |  |         self.cdata_elem = None | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Internal -- handle data as far as reasonable.  May leave state | 
					
						
							|  |  |  |     # and data to be processed by a subsequent call.  If 'end' is | 
					
						
							|  |  |  |     # true, force handling all data as if followed by EOF marker. | 
					
						
							|  |  |  |     def goahead(self, end): | 
					
						
							|  |  |  |         rawdata = self.rawdata | 
					
						
							|  |  |  |         i = 0 | 
					
						
							|  |  |  |         n = len(rawdata) | 
					
						
							|  |  |  |         while i < n: | 
					
						
							| 
									
										
										
										
											2013-11-23 19:52:05 +02:00
										 |  |  |             if self.convert_charrefs and not self.cdata_elem: | 
					
						
							|  |  |  |                 j = rawdata.find('<', i) | 
					
						
							|  |  |  |                 if j < 0: | 
					
						
							|  |  |  |                     if not end: | 
					
						
							|  |  |  |                         break  # wait till we get all the text | 
					
						
							|  |  |  |                     j = n | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |             else: | 
					
						
							| 
									
										
										
										
											2013-11-23 19:52:05 +02:00
										 |  |  |                 match = self.interesting.search(rawdata, i)  # < or & | 
					
						
							|  |  |  |                 if match: | 
					
						
							|  |  |  |                     j = match.start() | 
					
						
							|  |  |  |                 else: | 
					
						
							|  |  |  |                     if self.cdata_elem: | 
					
						
							|  |  |  |                         break | 
					
						
							|  |  |  |                     j = n | 
					
						
							|  |  |  |             if i < j: | 
					
						
							|  |  |  |                 if self.convert_charrefs and not self.cdata_elem: | 
					
						
							|  |  |  |                     self.handle_data(unescape(rawdata[i:j])) | 
					
						
							|  |  |  |                 else: | 
					
						
							|  |  |  |                     self.handle_data(rawdata[i:j]) | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |             i = self.updatepos(i, j) | 
					
						
							|  |  |  |             if i == n: break | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |             startswith = rawdata.startswith | 
					
						
							|  |  |  |             if startswith('<', i): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                 if starttagopen.match(rawdata, i): # < + letter | 
					
						
							|  |  |  |                     k = self.parse_starttag(i) | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                 elif startswith("</", i): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     k = self.parse_endtag(i) | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                 elif startswith("<!--", i): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     k = self.parse_comment(i) | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                 elif startswith("<?", i): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     k = self.parse_pi(i) | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                 elif startswith("<!", i): | 
					
						
							| 
									
										
										
										
											2014-08-02 14:10:30 +03:00
										 |  |  |                     k = self.parse_html_declaration(i) | 
					
						
							| 
									
										
										
										
											2001-09-04 15:10:16 +00:00
										 |  |  |                 elif (i + 1) < n: | 
					
						
							| 
									
										
										
										
											2001-08-20 21:24:19 +00:00
										 |  |  |                     self.handle_data("<") | 
					
						
							|  |  |  |                     k = i + 1 | 
					
						
							| 
									
										
										
										
											2001-09-04 15:10:16 +00:00
										 |  |  |                 else: | 
					
						
							|  |  |  |                     break | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                 if k < 0: | 
					
						
							| 
									
										
										
										
											2010-12-03 04:06:39 +00:00
										 |  |  |                     if not end: | 
					
						
							|  |  |  |                         break | 
					
						
							|  |  |  |                     k = rawdata.find('>', i + 1) | 
					
						
							|  |  |  |                     if k < 0: | 
					
						
							|  |  |  |                         k = rawdata.find('<', i + 1) | 
					
						
							|  |  |  |                         if k < 0: | 
					
						
							|  |  |  |                             k = i + 1 | 
					
						
							|  |  |  |                     else: | 
					
						
							|  |  |  |                         k += 1 | 
					
						
							| 
									
										
										
										
											2013-11-23 19:52:05 +02:00
										 |  |  |                     if self.convert_charrefs and not self.cdata_elem: | 
					
						
							|  |  |  |                         self.handle_data(unescape(rawdata[i:k])) | 
					
						
							|  |  |  |                     else: | 
					
						
							|  |  |  |                         self.handle_data(rawdata[i:k]) | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                 i = self.updatepos(i, k) | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |             elif startswith("&#", i): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                 match = charref.match(rawdata, i) | 
					
						
							|  |  |  |                 if match: | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  |                     name = match.group()[2:-1] | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     self.handle_charref(name) | 
					
						
							|  |  |  |                     k = match.end() | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                     if not startswith(';', k-1): | 
					
						
							| 
									
										
										
										
											2001-08-20 21:24:19 +00:00
										 |  |  |                         k = k - 1 | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     i = self.updatepos(i, k) | 
					
						
							|  |  |  |                     continue | 
					
						
							| 
									
										
										
										
											2001-09-04 15:10:16 +00:00
										 |  |  |                 else: | 
					
						
							| 
									
										
										
										
											2014-02-01 21:21:01 +02:00
										 |  |  |                     if ";" in rawdata[i:]:  # bail by consuming &# | 
					
						
							|  |  |  |                         self.handle_data(rawdata[i:i+2]) | 
					
						
							|  |  |  |                         i = self.updatepos(i, i+2) | 
					
						
							| 
									
										
										
										
											2001-09-04 15:10:16 +00:00
										 |  |  |                     break | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |             elif startswith('&', i): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                 match = entityref.match(rawdata, i) | 
					
						
							|  |  |  |                 if match: | 
					
						
							|  |  |  |                     name = match.group(1) | 
					
						
							|  |  |  |                     self.handle_entityref(name) | 
					
						
							|  |  |  |                     k = match.end() | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                     if not startswith(';', k-1): | 
					
						
							| 
									
										
										
										
											2001-08-20 21:24:19 +00:00
										 |  |  |                         k = k - 1 | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     i = self.updatepos(i, k) | 
					
						
							|  |  |  |                     continue | 
					
						
							| 
									
										
										
										
											2001-08-20 21:24:19 +00:00
										 |  |  |                 match = incomplete.match(rawdata, i) | 
					
						
							|  |  |  |                 if match: | 
					
						
							| 
									
										
										
										
											2001-09-04 15:10:16 +00:00
										 |  |  |                     # match.group() will contain at least 2 chars | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                     if end and match.group() == rawdata[i:]: | 
					
						
							| 
									
										
										
										
											2014-08-02 14:10:30 +03:00
										 |  |  |                         k = match.end() | 
					
						
							|  |  |  |                         if k <= i: | 
					
						
							|  |  |  |                             k = n | 
					
						
							|  |  |  |                         i = self.updatepos(i, i + 1) | 
					
						
							| 
									
										
										
										
											2001-09-04 15:10:16 +00:00
										 |  |  |                     # incomplete | 
					
						
							|  |  |  |                     break | 
					
						
							|  |  |  |                 elif (i + 1) < n: | 
					
						
							|  |  |  |                     # not the end of the buffer, and can't be confused | 
					
						
							|  |  |  |                     # with some other construct | 
					
						
							|  |  |  |                     self.handle_data("&") | 
					
						
							|  |  |  |                     i = self.updatepos(i, i + 1) | 
					
						
							|  |  |  |                 else: | 
					
						
							|  |  |  |                     break | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |             else: | 
					
						
							|  |  |  |                 assert 0, "interesting.search() lied" | 
					
						
							|  |  |  |         # end while | 
					
						
							| 
									
										
										
										
											2011-11-18 18:01:49 +02:00
										 |  |  |         if end and i < n and not self.cdata_elem: | 
					
						
							| 
									
										
										
										
											2013-11-23 19:52:05 +02:00
										 |  |  |             if self.convert_charrefs and not self.cdata_elem: | 
					
						
							|  |  |  |                 self.handle_data(unescape(rawdata[i:n])) | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 self.handle_data(rawdata[i:n]) | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |             i = self.updatepos(i, n) | 
					
						
							|  |  |  |         self.rawdata = rawdata[i:] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2012-02-13 15:50:37 +02:00
										 |  |  |     # Internal -- parse html declarations, return length or -1 if not terminated | 
					
						
							|  |  |  |     # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state | 
					
						
							|  |  |  |     # See also parse_declaration in _markupbase | 
					
						
							|  |  |  |     def parse_html_declaration(self, i): | 
					
						
							|  |  |  |         rawdata = self.rawdata | 
					
						
							| 
									
										
										
										
											2012-06-23 15:27:51 +02:00
										 |  |  |         assert rawdata[i:i+2] == '<!', ('unexpected call to ' | 
					
						
							|  |  |  |                                         'parse_html_declaration()') | 
					
						
							| 
									
										
										
										
											2012-02-13 15:50:37 +02:00
										 |  |  |         if rawdata[i:i+4] == '<!--': | 
					
						
							| 
									
										
										
										
											2012-02-13 20:20:00 +02:00
										 |  |  |             # this case is actually already handled in goahead() | 
					
						
							| 
									
										
										
										
											2012-02-13 15:50:37 +02:00
										 |  |  |             return self.parse_comment(i) | 
					
						
							|  |  |  |         elif rawdata[i:i+3] == '<![': | 
					
						
							|  |  |  |             return self.parse_marked_section(i) | 
					
						
							|  |  |  |         elif rawdata[i:i+9].lower() == '<!doctype': | 
					
						
							|  |  |  |             # find the closing > | 
					
						
							| 
									
										
										
										
											2012-02-13 20:20:00 +02:00
										 |  |  |             gtpos = rawdata.find('>', i+9) | 
					
						
							| 
									
										
										
										
											2012-02-13 15:50:37 +02:00
										 |  |  |             if gtpos == -1: | 
					
						
							|  |  |  |                 return -1 | 
					
						
							|  |  |  |             self.handle_decl(rawdata[i+2:gtpos]) | 
					
						
							|  |  |  |             return gtpos+1 | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             return self.parse_bogus_comment(i) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2012-02-10 10:45:44 +02:00
										 |  |  |     # Internal -- parse bogus comment, return length or -1 if not terminated | 
					
						
							|  |  |  |     # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state | 
					
						
							|  |  |  |     def parse_bogus_comment(self, i, report=1): | 
					
						
							|  |  |  |         rawdata = self.rawdata | 
					
						
							| 
									
										
										
										
											2012-06-23 15:27:51 +02:00
										 |  |  |         assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to ' | 
					
						
							|  |  |  |                                                 'parse_comment()') | 
					
						
							| 
									
										
										
										
											2012-02-10 10:45:44 +02:00
										 |  |  |         pos = rawdata.find('>', i+2) | 
					
						
							|  |  |  |         if pos == -1: | 
					
						
							|  |  |  |             return -1 | 
					
						
							|  |  |  |         if report: | 
					
						
							|  |  |  |             self.handle_comment(rawdata[i+2:pos]) | 
					
						
							|  |  |  |         return pos + 1 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |     # Internal -- parse processing instr, return end or -1 if not terminated | 
					
						
							|  |  |  |     def parse_pi(self, i): | 
					
						
							|  |  |  |         rawdata = self.rawdata | 
					
						
							|  |  |  |         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()' | 
					
						
							|  |  |  |         match = piclose.search(rawdata, i+2) # > | 
					
						
							|  |  |  |         if not match: | 
					
						
							|  |  |  |             return -1 | 
					
						
							|  |  |  |         j = match.start() | 
					
						
							|  |  |  |         self.handle_pi(rawdata[i+2: j]) | 
					
						
							|  |  |  |         j = match.end() | 
					
						
							|  |  |  |         return j | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Internal -- handle starttag, return end or -1 if not terminated | 
					
						
							|  |  |  |     def parse_starttag(self, i): | 
					
						
							|  |  |  |         self.__starttag_text = None | 
					
						
							|  |  |  |         endpos = self.check_for_whole_start_tag(i) | 
					
						
							|  |  |  |         if endpos < 0: | 
					
						
							|  |  |  |             return endpos | 
					
						
							|  |  |  |         rawdata = self.rawdata | 
					
						
							|  |  |  |         self.__starttag_text = rawdata[i:endpos] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Now parse the data between i+1 and j into a tag and attrs | 
					
						
							|  |  |  |         attrs = [] | 
					
						
							| 
									
										
										
										
											2014-08-02 14:10:30 +03:00
										 |  |  |         match = tagfind_tolerant.match(rawdata, i+1) | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         assert match, 'unexpected call to parse_starttag()' | 
					
						
							|  |  |  |         k = match.end() | 
					
						
							| 
									
										
										
										
											2012-04-18 19:18:22 -06:00
										 |  |  |         self.lasttag = tag = match.group(1).lower() | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         while k < endpos: | 
					
						
							| 
									
										
										
										
											2014-08-02 14:10:30 +03:00
										 |  |  |             m = attrfind_tolerant.match(rawdata, k) | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |             if not m: | 
					
						
							|  |  |  |                 break | 
					
						
							|  |  |  |             attrname, rest, attrvalue = m.group(1, 2, 3) | 
					
						
							|  |  |  |             if not rest: | 
					
						
							|  |  |  |                 attrvalue = None | 
					
						
							|  |  |  |             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ | 
					
						
							|  |  |  |                  attrvalue[:1] == '"' == attrvalue[-1:]: | 
					
						
							|  |  |  |                 attrvalue = attrvalue[1:-1] | 
					
						
							| 
									
										
										
										
											2011-11-14 18:53:33 +02:00
										 |  |  |             if attrvalue: | 
					
						
							| 
									
										
										
										
											2013-11-19 20:28:45 +02:00
										 |  |  |                 attrvalue = unescape(attrvalue) | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |             attrs.append((attrname.lower(), attrvalue)) | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |             k = m.end() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |         end = rawdata[k:endpos].strip() | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         if end not in (">", "/>"): | 
					
						
							|  |  |  |             lineno, offset = self.getpos() | 
					
						
							|  |  |  |             if "\n" in self.__starttag_text: | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                 lineno = lineno + self.__starttag_text.count("\n") | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                 offset = len(self.__starttag_text) \ | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                          - self.__starttag_text.rfind("\n") | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |             else: | 
					
						
							|  |  |  |                 offset = offset + len(self.__starttag_text) | 
					
						
							| 
									
										
										
										
											2010-12-03 04:06:39 +00:00
										 |  |  |             self.handle_data(rawdata[i:endpos]) | 
					
						
							|  |  |  |             return endpos | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |         if end.endswith('/>'): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |             # XHTML-style empty tag: <span attr="value" /> | 
					
						
							|  |  |  |             self.handle_startendtag(tag, attrs) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             self.handle_starttag(tag, attrs) | 
					
						
							|  |  |  |             if tag in self.CDATA_CONTENT_ELEMENTS: | 
					
						
							| 
									
										
										
										
											2011-11-01 14:12:22 +02:00
										 |  |  |                 self.set_cdata_mode(tag) | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         return endpos | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Internal -- check to see if we have a complete starttag; return end | 
					
						
							|  |  |  |     # or -1 if incomplete. | 
					
						
							|  |  |  |     def check_for_whole_start_tag(self, i): | 
					
						
							|  |  |  |         rawdata = self.rawdata | 
					
						
							| 
									
										
										
										
											2014-08-02 14:10:30 +03:00
										 |  |  |         m = locatestarttagend_tolerant.match(rawdata, i) | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         if m: | 
					
						
							|  |  |  |             j = m.end() | 
					
						
							|  |  |  |             next = rawdata[j:j+1] | 
					
						
							|  |  |  |             if next == ">": | 
					
						
							|  |  |  |                 return j + 1 | 
					
						
							|  |  |  |             if next == "/": | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                 if rawdata.startswith("/>", j): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     return j + 2 | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                 if rawdata.startswith("/", j): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     # buffer boundary | 
					
						
							|  |  |  |                     return -1 | 
					
						
							|  |  |  |                 # else bogus input | 
					
						
							| 
									
										
										
										
											2010-12-03 04:06:39 +00:00
										 |  |  |                 if j > i: | 
					
						
							|  |  |  |                     return j | 
					
						
							|  |  |  |                 else: | 
					
						
							|  |  |  |                     return i + 1 | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |             if next == "": | 
					
						
							|  |  |  |                 # end of input | 
					
						
							|  |  |  |                 return -1 | 
					
						
							|  |  |  |             if next in ("abcdefghijklmnopqrstuvwxyz=/" | 
					
						
							|  |  |  |                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"): | 
					
						
							|  |  |  |                 # end of input in or before attribute value, or we have the | 
					
						
							|  |  |  |                 # '/' from a '/>' ending | 
					
						
							|  |  |  |                 return -1 | 
					
						
							| 
									
										
										
										
											2010-12-03 04:06:39 +00:00
										 |  |  |             if j > i: | 
					
						
							|  |  |  |                 return j | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 return i + 1 | 
					
						
							| 
									
										
										
										
											2001-09-24 20:10:28 +00:00
										 |  |  |         raise AssertionError("we should not get here!") | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Internal -- parse endtag, return end or -1 if incomplete | 
					
						
							|  |  |  |     def parse_endtag(self, i): | 
					
						
							|  |  |  |         rawdata = self.rawdata | 
					
						
							|  |  |  |         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag" | 
					
						
							|  |  |  |         match = endendtag.search(rawdata, i+1) # > | 
					
						
							|  |  |  |         if not match: | 
					
						
							|  |  |  |             return -1 | 
					
						
							| 
									
										
										
										
											2012-02-13 11:24:50 +02:00
										 |  |  |         gtpos = match.end() | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         match = endtagfind.match(rawdata, i) # </ + tag + > | 
					
						
							|  |  |  |         if not match: | 
					
						
							| 
									
										
										
										
											2011-11-01 14:12:22 +02:00
										 |  |  |             if self.cdata_elem is not None: | 
					
						
							| 
									
										
										
										
											2012-02-13 11:24:50 +02:00
										 |  |  |                 self.handle_data(rawdata[i:gtpos]) | 
					
						
							|  |  |  |                 return gtpos | 
					
						
							|  |  |  |             # find the name: w3.org/TR/html5/tokenization.html#tag-name-state | 
					
						
							|  |  |  |             namematch = tagfind_tolerant.match(rawdata, i+2) | 
					
						
							|  |  |  |             if not namematch: | 
					
						
							|  |  |  |                 # w3.org/TR/html5/tokenization.html#end-tag-open-state | 
					
						
							|  |  |  |                 if rawdata[i:i+3] == '</>': | 
					
						
							|  |  |  |                     return i+3 | 
					
						
							|  |  |  |                 else: | 
					
						
							|  |  |  |                     return self.parse_bogus_comment(i) | 
					
						
							| 
									
										
										
										
											2013-11-07 18:33:24 +02:00
										 |  |  |             tagname = namematch.group(1).lower() | 
					
						
							| 
									
										
										
										
											2012-02-13 11:24:50 +02:00
										 |  |  |             # consume and ignore other stuff between the name and the > | 
					
						
							|  |  |  |             # Note: this is not 100% correct, since we might have things like | 
					
						
							|  |  |  |             # </tag attr=">">, but looking for > after tha name should cover | 
					
						
							|  |  |  |             # most of the cases and is much simpler | 
					
						
							|  |  |  |             gtpos = rawdata.find('>', namematch.end()) | 
					
						
							|  |  |  |             self.handle_endtag(tagname) | 
					
						
							|  |  |  |             return gtpos+1 | 
					
						
							| 
									
										
										
										
											2011-11-01 14:12:22 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         elem = match.group(1).lower() # script or style | 
					
						
							|  |  |  |         if self.cdata_elem is not None: | 
					
						
							|  |  |  |             if elem != self.cdata_elem: | 
					
						
							| 
									
										
										
										
											2012-02-13 11:24:50 +02:00
										 |  |  |                 self.handle_data(rawdata[i:gtpos]) | 
					
						
							|  |  |  |                 return gtpos | 
					
						
							| 
									
										
										
										
											2011-11-01 14:12:22 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         self.handle_endtag(elem.lower()) | 
					
						
							| 
									
										
										
										
											2002-05-14 15:50:11 +00:00
										 |  |  |         self.clear_cdata_mode() | 
					
						
							| 
									
										
										
										
											2012-02-13 11:24:50 +02:00
										 |  |  |         return gtpos | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- finish processing of start+end tag: <tag.../> | 
					
						
							|  |  |  |     def handle_startendtag(self, tag, attrs): | 
					
						
							|  |  |  |         self.handle_starttag(tag, attrs) | 
					
						
							|  |  |  |         self.handle_endtag(tag) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle start tag | 
					
						
							|  |  |  |     def handle_starttag(self, tag, attrs): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle end tag | 
					
						
							|  |  |  |     def handle_endtag(self, tag): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle character reference | 
					
						
							|  |  |  |     def handle_charref(self, name): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle entity reference | 
					
						
							|  |  |  |     def handle_entityref(self, name): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle data | 
					
						
							|  |  |  |     def handle_data(self, data): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle comment | 
					
						
							|  |  |  |     def handle_comment(self, data): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle declaration | 
					
						
							|  |  |  |     def handle_decl(self, decl): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle processing instruction | 
					
						
							|  |  |  |     def handle_pi(self, data): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-09-24 20:10:28 +00:00
										 |  |  |     def unknown_decl(self, data): | 
					
						
							| 
									
										
										
										
											2014-08-02 14:10:30 +03:00
										 |  |  |         pass | 
					
						
							| 
									
										
										
										
											2013-11-22 05:49:29 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Internal -- helper to remove special character quoting | 
					
						
							|  |  |  |     def unescape(self, s): | 
					
						
							|  |  |  |         warnings.warn('The unescape method is deprecated and will be removed ' | 
					
						
							|  |  |  |                       'in 3.5, use html.unescape() instead.', | 
					
						
							|  |  |  |                       DeprecationWarning, stacklevel=2) | 
					
						
							|  |  |  |         return unescape(s) |