| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  | """A parser for HTML and XHTML.""" | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | # This file is based on sgmllib.py, but the API is slightly different. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # XXX There should be a way to distinguish between PCDATA (parsed | 
					
						
							|  |  |  | # character data -- the normal case), RCDATA (replaceable character | 
					
						
							|  |  |  | # data -- only char and entity references and end tags are special) | 
					
						
							|  |  |  | # and CDATA (character data -- only end tags are special). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-09-24 20:10:28 +00:00
										 |  |  | import markupbase | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | import re | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Regular expressions used for parsing | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | interesting_normal = re.compile('[&<]') | 
					
						
							|  |  |  | interesting_cdata = re.compile(r'<(/|\Z)') | 
					
						
							| 
									
										
										
										
											2001-09-04 15:10:16 +00:00
										 |  |  | incomplete = re.compile('&[a-zA-Z#]') | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  | charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | starttagopen = re.compile('<[a-zA-Z]') | 
					
						
							|  |  |  | piclose = re.compile('>') | 
					
						
							|  |  |  | commentclose = re.compile(r'--\s*>') | 
					
						
							|  |  |  | tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') | 
					
						
							|  |  |  | attrfind = re.compile( | 
					
						
							|  |  |  |     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' | 
					
						
							|  |  |  |     r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | locatestarttagend = re.compile(r"""
 | 
					
						
							|  |  |  |   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name | 
					
						
							|  |  |  |   (?:\s+                             # whitespace before attribute name | 
					
						
							|  |  |  |     (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name | 
					
						
							|  |  |  |       (?:\s*=\s*                     # value indicator | 
					
						
							|  |  |  |         (?:'[^']*'                   # LITA-enclosed value | 
					
						
							|  |  |  |           |\"[^\"]*\"                # LIT-enclosed value | 
					
						
							|  |  |  |           |[^'\">\s]+                # bare value | 
					
						
							|  |  |  |          ) | 
					
						
							|  |  |  |        )? | 
					
						
							|  |  |  |      ) | 
					
						
							|  |  |  |    )* | 
					
						
							|  |  |  |   \s*                                # trailing whitespace | 
					
						
							|  |  |  | """, re.VERBOSE)
 | 
					
						
							|  |  |  | endendtag = re.compile('>') | 
					
						
							|  |  |  | endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class HTMLParseError(Exception): | 
					
						
							|  |  |  |     """Exception raised for all parse errors.""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def __init__(self, msg, position=(None, None)): | 
					
						
							|  |  |  |         assert msg | 
					
						
							|  |  |  |         self.msg = msg | 
					
						
							|  |  |  |         self.lineno = position[0] | 
					
						
							|  |  |  |         self.offset = position[1] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def __str__(self): | 
					
						
							|  |  |  |         result = self.msg | 
					
						
							|  |  |  |         if self.lineno is not None: | 
					
						
							|  |  |  |             result = result + ", at line %d" % self.lineno | 
					
						
							|  |  |  |         if self.offset is not None: | 
					
						
							|  |  |  |             result = result + ", column %d" % (self.offset + 1) | 
					
						
							|  |  |  |         return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-09-24 20:10:28 +00:00
										 |  |  | class HTMLParser(markupbase.ParserBase): | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  |     """Find tags and other markup and call handler functions.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Usage: | 
					
						
							|  |  |  |         p = HTMLParser() | 
					
						
							|  |  |  |         p.feed(data) | 
					
						
							|  |  |  |         ... | 
					
						
							|  |  |  |         p.close() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Start tags are handled by calling self.handle_starttag() or | 
					
						
							|  |  |  |     self.handle_startendtag(); end tags by self.handle_endtag().  The | 
					
						
							|  |  |  |     data between tags is passed from the parser to the derived class | 
					
						
							|  |  |  |     by calling self.handle_data() with the data as argument (the data | 
					
						
							|  |  |  |     may be split up in arbitrary chunks).  Entity references are | 
					
						
							|  |  |  |     passed by calling self.handle_entityref() with the entity | 
					
						
							|  |  |  |     reference as the argument.  Numeric character references are | 
					
						
							|  |  |  |     passed to self.handle_charref() with the string containing the | 
					
						
							|  |  |  |     reference as the argument. | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     CDATA_CONTENT_ELEMENTS = ("script", "style") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def __init__(self): | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  |         """Initialize and reset this instance.""" | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         self.reset() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def reset(self): | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  |         """Reset this instance.  Loses all unprocessed data.""" | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         self.rawdata = '' | 
					
						
							|  |  |  |         self.lasttag = '???' | 
					
						
							|  |  |  |         self.interesting = interesting_normal | 
					
						
							| 
									
										
										
										
											2001-09-24 20:10:28 +00:00
										 |  |  |         markupbase.ParserBase.reset(self) | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def feed(self, data): | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  |         """Feed data to the parser.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Call this as often as you want, with as little or as much text | 
					
						
							|  |  |  |         as you want (may include '\n'). | 
					
						
							|  |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         self.rawdata = self.rawdata + data | 
					
						
							|  |  |  |         self.goahead(0) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def close(self): | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  |         """Handle any buffered data.""" | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         self.goahead(1) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-09-24 20:10:28 +00:00
										 |  |  |     def error(self, message): | 
					
						
							|  |  |  |         raise HTMLParseError(message, self.getpos()) | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     __starttag_text = None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def get_starttag_text(self): | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  |         """Return full source of start tag: '<...>'.""" | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         return self.__starttag_text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def set_cdata_mode(self): | 
					
						
							|  |  |  |         self.interesting = interesting_cdata | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def clear_cdata_mode(self): | 
					
						
							|  |  |  |         self.interesting = interesting_normal | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Internal -- handle data as far as reasonable.  May leave state | 
					
						
							|  |  |  |     # and data to be processed by a subsequent call.  If 'end' is | 
					
						
							|  |  |  |     # true, force handling all data as if followed by EOF marker. | 
					
						
							|  |  |  |     def goahead(self, end): | 
					
						
							|  |  |  |         rawdata = self.rawdata | 
					
						
							|  |  |  |         i = 0 | 
					
						
							|  |  |  |         n = len(rawdata) | 
					
						
							|  |  |  |         while i < n: | 
					
						
							|  |  |  |             match = self.interesting.search(rawdata, i) # < or & | 
					
						
							|  |  |  |             if match: | 
					
						
							|  |  |  |                 j = match.start() | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 j = n | 
					
						
							|  |  |  |             if i < j: self.handle_data(rawdata[i:j]) | 
					
						
							|  |  |  |             i = self.updatepos(i, j) | 
					
						
							|  |  |  |             if i == n: break | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |             startswith = rawdata.startswith | 
					
						
							|  |  |  |             if startswith('<', i): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                 if starttagopen.match(rawdata, i): # < + letter | 
					
						
							|  |  |  |                     k = self.parse_starttag(i) | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                 elif startswith("</", i): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     k = self.parse_endtag(i) | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                 elif startswith("<!--", i): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     k = self.parse_comment(i) | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                 elif startswith("<?", i): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     k = self.parse_pi(i) | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                 elif startswith("<!", i): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     k = self.parse_declaration(i) | 
					
						
							| 
									
										
										
										
											2001-09-04 15:10:16 +00:00
										 |  |  |                 elif (i + 1) < n: | 
					
						
							| 
									
										
										
										
											2001-08-20 21:24:19 +00:00
										 |  |  |                     self.handle_data("<") | 
					
						
							|  |  |  |                     k = i + 1 | 
					
						
							| 
									
										
										
										
											2001-09-04 15:10:16 +00:00
										 |  |  |                 else: | 
					
						
							|  |  |  |                     break | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                 if k < 0: | 
					
						
							|  |  |  |                     if end: | 
					
						
							| 
									
										
										
										
											2001-09-24 20:10:28 +00:00
										 |  |  |                         self.error("EOF in middle of construct") | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     break | 
					
						
							|  |  |  |                 i = self.updatepos(i, k) | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |             elif startswith("&#", i): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                 match = charref.match(rawdata, i) | 
					
						
							|  |  |  |                 if match: | 
					
						
							| 
									
										
										
										
											2001-08-03 19:50:59 +00:00
										 |  |  |                     name = match.group()[2:-1] | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     self.handle_charref(name) | 
					
						
							|  |  |  |                     k = match.end() | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                     if not startswith(';', k-1): | 
					
						
							| 
									
										
										
										
											2001-08-20 21:24:19 +00:00
										 |  |  |                         k = k - 1 | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     i = self.updatepos(i, k) | 
					
						
							|  |  |  |                     continue | 
					
						
							| 
									
										
										
										
											2001-09-04 15:10:16 +00:00
										 |  |  |                 else: | 
					
						
							|  |  |  |                     break | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |             elif startswith('&', i): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                 match = entityref.match(rawdata, i) | 
					
						
							|  |  |  |                 if match: | 
					
						
							|  |  |  |                     name = match.group(1) | 
					
						
							|  |  |  |                     self.handle_entityref(name) | 
					
						
							|  |  |  |                     k = match.end() | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                     if not startswith(';', k-1): | 
					
						
							| 
									
										
										
										
											2001-08-20 21:24:19 +00:00
										 |  |  |                         k = k - 1 | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     i = self.updatepos(i, k) | 
					
						
							|  |  |  |                     continue | 
					
						
							| 
									
										
										
										
											2001-08-20 21:24:19 +00:00
										 |  |  |                 match = incomplete.match(rawdata, i) | 
					
						
							|  |  |  |                 if match: | 
					
						
							| 
									
										
										
										
											2001-09-04 15:10:16 +00:00
										 |  |  |                     # match.group() will contain at least 2 chars | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                     if end and match.group() == rawdata[i:]: | 
					
						
							| 
									
										
										
										
											2001-09-24 20:10:28 +00:00
										 |  |  |                         self.error("EOF in middle of entity or char ref") | 
					
						
							| 
									
										
										
										
											2001-09-04 15:10:16 +00:00
										 |  |  |                     # incomplete | 
					
						
							|  |  |  |                     break | 
					
						
							|  |  |  |                 elif (i + 1) < n: | 
					
						
							|  |  |  |                     # not the end of the buffer, and can't be confused | 
					
						
							|  |  |  |                     # with some other construct | 
					
						
							|  |  |  |                     self.handle_data("&") | 
					
						
							|  |  |  |                     i = self.updatepos(i, i + 1) | 
					
						
							|  |  |  |                 else: | 
					
						
							|  |  |  |                     break | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |             else: | 
					
						
							|  |  |  |                 assert 0, "interesting.search() lied" | 
					
						
							|  |  |  |         # end while | 
					
						
							|  |  |  |         if end and i < n: | 
					
						
							|  |  |  |             self.handle_data(rawdata[i:n]) | 
					
						
							|  |  |  |             i = self.updatepos(i, n) | 
					
						
							|  |  |  |         self.rawdata = rawdata[i:] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Internal -- parse comment, return end or -1 if not terminated | 
					
						
							| 
									
										
										
										
											2001-09-04 15:10:16 +00:00
										 |  |  |     def parse_comment(self, i, report=1): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         rawdata = self.rawdata | 
					
						
							|  |  |  |         assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()' | 
					
						
							|  |  |  |         match = commentclose.search(rawdata, i+4) | 
					
						
							|  |  |  |         if not match: | 
					
						
							|  |  |  |             return -1 | 
					
						
							| 
									
										
										
										
											2001-09-04 15:10:16 +00:00
										 |  |  |         if report: | 
					
						
							|  |  |  |             j = match.start() | 
					
						
							|  |  |  |             self.handle_comment(rawdata[i+4: j]) | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         j = match.end() | 
					
						
							|  |  |  |         return j | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Internal -- parse processing instr, return end or -1 if not terminated | 
					
						
							|  |  |  |     def parse_pi(self, i): | 
					
						
							|  |  |  |         rawdata = self.rawdata | 
					
						
							|  |  |  |         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()' | 
					
						
							|  |  |  |         match = piclose.search(rawdata, i+2) # > | 
					
						
							|  |  |  |         if not match: | 
					
						
							|  |  |  |             return -1 | 
					
						
							|  |  |  |         j = match.start() | 
					
						
							|  |  |  |         self.handle_pi(rawdata[i+2: j]) | 
					
						
							|  |  |  |         j = match.end() | 
					
						
							|  |  |  |         return j | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Internal -- handle starttag, return end or -1 if not terminated | 
					
						
							|  |  |  |     def parse_starttag(self, i): | 
					
						
							|  |  |  |         self.__starttag_text = None | 
					
						
							|  |  |  |         endpos = self.check_for_whole_start_tag(i) | 
					
						
							|  |  |  |         if endpos < 0: | 
					
						
							|  |  |  |             return endpos | 
					
						
							|  |  |  |         rawdata = self.rawdata | 
					
						
							|  |  |  |         self.__starttag_text = rawdata[i:endpos] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Now parse the data between i+1 and j into a tag and attrs | 
					
						
							|  |  |  |         attrs = [] | 
					
						
							|  |  |  |         match = tagfind.match(rawdata, i+1) | 
					
						
							|  |  |  |         assert match, 'unexpected call to parse_starttag()' | 
					
						
							|  |  |  |         k = match.end() | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |         self.lasttag = tag = rawdata[i+1:k].lower() | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         while k < endpos: | 
					
						
							|  |  |  |             m = attrfind.match(rawdata, k) | 
					
						
							|  |  |  |             if not m: | 
					
						
							|  |  |  |                 break | 
					
						
							|  |  |  |             attrname, rest, attrvalue = m.group(1, 2, 3) | 
					
						
							|  |  |  |             if not rest: | 
					
						
							|  |  |  |                 attrvalue = None | 
					
						
							|  |  |  |             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ | 
					
						
							|  |  |  |                  attrvalue[:1] == '"' == attrvalue[-1:]: | 
					
						
							|  |  |  |                 attrvalue = attrvalue[1:-1] | 
					
						
							|  |  |  |                 attrvalue = self.unescape(attrvalue) | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |             attrs.append((attrname.lower(), attrvalue)) | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |             k = m.end() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |         end = rawdata[k:endpos].strip() | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         if end not in (">", "/>"): | 
					
						
							|  |  |  |             lineno, offset = self.getpos() | 
					
						
							|  |  |  |             if "\n" in self.__starttag_text: | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                 lineno = lineno + self.__starttag_text.count("\n") | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                 offset = len(self.__starttag_text) \ | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                          - self.__starttag_text.rfind("\n") | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |             else: | 
					
						
							|  |  |  |                 offset = offset + len(self.__starttag_text) | 
					
						
							| 
									
										
										
										
											2001-09-24 20:10:28 +00:00
										 |  |  |             self.error("junk characters in start tag: %s" | 
					
						
							|  |  |  |                        % `rawdata[k:endpos][:20]`) | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |         if end.endswith('/>'): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |             # XHTML-style empty tag: <span attr="value" /> | 
					
						
							|  |  |  |             self.handle_startendtag(tag, attrs) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             self.handle_starttag(tag, attrs) | 
					
						
							|  |  |  |             if tag in self.CDATA_CONTENT_ELEMENTS: | 
					
						
							|  |  |  |                 self.set_cdata_mode() | 
					
						
							|  |  |  |         return endpos | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Internal -- check to see if we have a complete starttag; return end | 
					
						
							|  |  |  |     # or -1 if incomplete. | 
					
						
							|  |  |  |     def check_for_whole_start_tag(self, i): | 
					
						
							|  |  |  |         rawdata = self.rawdata | 
					
						
							|  |  |  |         m = locatestarttagend.match(rawdata, i) | 
					
						
							|  |  |  |         if m: | 
					
						
							|  |  |  |             j = m.end() | 
					
						
							|  |  |  |             next = rawdata[j:j+1] | 
					
						
							|  |  |  |             if next == ">": | 
					
						
							|  |  |  |                 return j + 1 | 
					
						
							|  |  |  |             if next == "/": | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                 if rawdata.startswith("/>", j): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     return j + 2 | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |                 if rawdata.startswith("/", j): | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |                     # buffer boundary | 
					
						
							|  |  |  |                     return -1 | 
					
						
							|  |  |  |                 # else bogus input | 
					
						
							|  |  |  |                 self.updatepos(i, j + 1) | 
					
						
							| 
									
										
										
										
											2001-09-24 20:10:28 +00:00
										 |  |  |                 self.error("malformed empty start tag") | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |             if next == "": | 
					
						
							|  |  |  |                 # end of input | 
					
						
							|  |  |  |                 return -1 | 
					
						
							|  |  |  |             if next in ("abcdefghijklmnopqrstuvwxyz=/" | 
					
						
							|  |  |  |                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"): | 
					
						
							|  |  |  |                 # end of input in or before attribute value, or we have the | 
					
						
							|  |  |  |                 # '/' from a '/>' ending | 
					
						
							|  |  |  |                 return -1 | 
					
						
							|  |  |  |             self.updatepos(i, j) | 
					
						
							| 
									
										
										
										
											2001-09-24 20:10:28 +00:00
										 |  |  |             self.error("malformed start tag") | 
					
						
							|  |  |  |         raise AssertionError("we should not get here!") | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Internal -- parse endtag, return end or -1 if incomplete | 
					
						
							|  |  |  |     def parse_endtag(self, i): | 
					
						
							|  |  |  |         rawdata = self.rawdata | 
					
						
							|  |  |  |         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag" | 
					
						
							|  |  |  |         match = endendtag.search(rawdata, i+1) # > | 
					
						
							|  |  |  |         if not match: | 
					
						
							|  |  |  |             return -1 | 
					
						
							|  |  |  |         j = match.end() | 
					
						
							|  |  |  |         match = endtagfind.match(rawdata, i) # </ + tag + > | 
					
						
							|  |  |  |         if not match: | 
					
						
							| 
									
										
										
										
											2001-09-24 20:10:28 +00:00
										 |  |  |             self.error("bad end tag: %s" % `rawdata[i:j]`) | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         tag = match.group(1) | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |         self.handle_endtag(tag.lower()) | 
					
						
							| 
									
										
										
										
											2002-05-14 15:50:11 +00:00
										 |  |  |         self.clear_cdata_mode() | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         return j | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- finish processing of start+end tag: <tag.../> | 
					
						
							|  |  |  |     def handle_startendtag(self, tag, attrs): | 
					
						
							|  |  |  |         self.handle_starttag(tag, attrs) | 
					
						
							|  |  |  |         self.handle_endtag(tag) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle start tag | 
					
						
							|  |  |  |     def handle_starttag(self, tag, attrs): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle end tag | 
					
						
							|  |  |  |     def handle_endtag(self, tag): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle character reference | 
					
						
							|  |  |  |     def handle_charref(self, name): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle entity reference | 
					
						
							|  |  |  |     def handle_entityref(self, name): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle data | 
					
						
							|  |  |  |     def handle_data(self, data): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle comment | 
					
						
							|  |  |  |     def handle_comment(self, data): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle declaration | 
					
						
							|  |  |  |     def handle_decl(self, decl): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Overridable -- handle processing instruction | 
					
						
							|  |  |  |     def handle_pi(self, data): | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-09-24 20:10:28 +00:00
										 |  |  |     def unknown_decl(self, data): | 
					
						
							|  |  |  |         self.error("unknown declaration: " + `data`) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |     # Internal -- helper to remove special character quoting | 
					
						
							|  |  |  |     def unescape(self, s): | 
					
						
							|  |  |  |         if '&' not in s: | 
					
						
							|  |  |  |             return s | 
					
						
							| 
									
										
										
										
											2001-12-03 17:09:50 +00:00
										 |  |  |         s = s.replace("<", "<") | 
					
						
							|  |  |  |         s = s.replace(">", ">") | 
					
						
							|  |  |  |         s = s.replace("'", "'") | 
					
						
							|  |  |  |         s = s.replace(""", '"') | 
					
						
							|  |  |  |         s = s.replace("&", "&") # Must be last | 
					
						
							| 
									
										
										
										
											2001-05-18 14:50:52 +00:00
										 |  |  |         return s |