mirror of
				https://github.com/python/cpython.git
				synced 2025-10-28 04:04:44 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			486 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			486 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """HTML 2.0 parser.
 | |
| 
 | |
| See the HTML 2.0 specification:
 | |
| http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
 | |
| """
 | |
| 
 | |
| import sgmllib
 | |
| 
 | |
| from formatter import AS_IS
 | |
| 
 | |
| __all__ = ["HTMLParser", "HTMLParseError"]
 | |
| 
 | |
| 
 | |
class HTMLParseError(sgmllib.SGMLParseError):
    """Exception signalling that an HTML document could not be parsed.

    HTMLParser.error() raises it with the failure message as its argument.
    """
 | |
| 
 | |
| 
 | |
class HTMLParser(sgmllib.SGMLParser):
    """This is the basic HTML parser class.

    It supports all entity names required by the XHTML 1.0 Recommendation.
    It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
    elements.

    Tag handlers are named start_TAG / end_TAG for container elements and
    do_TAG for empty elements; each one translates the tag into calls on
    the formatter object supplied to the constructor.

    Instance state (initialised by reset()):
      savedata   -- buffer used between save_bgn() and save_end(), or None
      isindex    -- set to 1 once an <ISINDEX> tag has been seen
      title      -- text of the <TITLE> element, or None
      base       -- HREF of the last <BASE> tag, or None
      anchor     -- HREF of the anchor currently open, or None
      anchorlist -- every non-empty HREF passed to anchor_bgn(), in order
      nofill     -- nesting depth of literal (<PRE>-like) regions
      list_stack -- stack of [kind, label, counter] entries for open lists
    """

    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Creates an instance of the HTMLParser class.

        The formatter parameter is the formatter instance associated with
        the parser.  The verbose parameter is handed on unchanged to the
        sgmllib.SGMLParser constructor.

        """
        sgmllib.SGMLParser.__init__(self, verbose)
        self.formatter = formatter

    def error(self, message):
        """Report a parse failure by raising HTMLParseError(message)."""
        raise HTMLParseError(message)

    def reset(self):
        """Reset the base parser and all HTML-specific state."""
        sgmllib.SGMLParser.reset(self)
        self.savedata = None
        self.isindex = 0
        self.title = None
        self.base = None
        self.anchor = None
        self.anchorlist = []
        self.nofill = 0
        self.list_stack = []

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Route character data to the save buffer or to the formatter.

        While a save_bgn()/save_end() pair is open the data is appended to
        the buffer.  Otherwise it goes to the formatter: as literal data
        when nofill is set, as flowing data when it is not.

        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
            return
        if self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Begins saving character data in a buffer instead of sending it
        to the formatter object.

        Retrieve the stored data via the save_end() method.  Use of the
        save_bgn() / save_end() pair may not be nested.

        """
        self.savedata = ''

    def save_end(self):
        """Ends buffering character data and returns all data saved since
        the preceding call to the save_bgn() method.

        If the nofill flag is false, whitespace is collapsed to single
        spaces.  A call to this method without a preceding call to the
        save_bgn() method will raise a TypeError exception.

        """
        data = self.savedata
        if data is None:
            # Enforce the documented contract explicitly.  Without this
            # check the failure surfaced as an AttributeError from
            # None.split(), or not at all (None was returned) when nofill
            # was set.
            raise TypeError("save_end() called without a preceding "
                            "save_bgn()")
        self.savedata = None
        if not self.nofill:
            data = ' '.join(data.split())
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """This method is called at the start of an anchor region.

        The arguments correspond to the attributes of the <A> tag with
        the same names.  The default implementation maintains a list of
        hyperlinks (defined by the HREF attribute for <A> tags) within
        the document.  The list of hyperlinks is available as the data
        attribute anchorlist.

        """
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """This method is called at the end of an anchor region.

        The default implementation adds a textual footnote marker using an
        index into the list of hyperlinks created by the anchor_bgn()
        method.  Nothing is emitted for anchors without an HREF.

        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """This method is called to handle images.

        The default implementation simply passes the alt value to the
        handle_data() method.  do_img() supplies ismap, align, width and
        height as the extra positional arguments.

        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Start collecting the title text instead of rendering it."""
        self.save_bgn()

    def end_title(self):
        """Store the collected title text in the title attribute."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Record the HREF of a <BASE> tag; the last HREF seen wins."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        """Note that the document contains an <ISINDEX> tag."""
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs): # Deprecated
        pass

    # ------ Body elements

    # --- Headings

    def _heading_bgn(self, tag):
        """Shared start handler for <H1>..<H6>.

        Ends the current paragraph with one blank line and pushes the font
        (tag, 0, 1, 0): the tag name in the first slot and the third slot
        (the one start_b() sets) switched on.

        """
        self.formatter.end_paragraph(1)
        self.formatter.push_font((tag, 0, 1, 0))

    def _heading_end(self):
        """Shared end handler for <H1>..<H6>: close paragraph, pop font."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h1(self, attrs): self._heading_bgn('h1')
    def end_h1(self): self._heading_end()

    def start_h2(self, attrs): self._heading_bgn('h2')
    def end_h2(self): self._heading_end()

    def start_h3(self, attrs): self._heading_bgn('h3')
    def end_h3(self): self._heading_end()

    def start_h4(self, attrs): self._heading_bgn('h4')
    def end_h4(self): self._heading_end()

    def start_h5(self, attrs): self._heading_bgn('h5')
    def end_h5(self): self._heading_end()

    def start_h6(self, attrs): self._heading_bgn('h6')
    def end_h6(self): self._heading_end()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        """End the current paragraph with one blank line."""
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Open a literal region: fixed-width font, nofill one deeper."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill = self.nofill + 1

    def end_pre(self):
        """Close a literal region opened by start_pre()."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        # Clamp at zero so a stray end tag cannot drive the depth negative.
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        """Treat <XMP> like <PRE> and put the SGML parser in literal mode."""
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        """Treat <LISTING> like <PRE>, with the SGML parser in literal mode."""
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        """Open an <ADDRESS> region (same font change as start_i())."""
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        """Open a block quotation by pushing a 'blockquote' margin."""
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        """Open an unordered list whose items are labelled '*'."""
        # A blank line is requested only for an outermost list.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """Close the innermost open list and restore the margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emit the label for the next item of the innermost list.

        The list's counter is incremented first.  Outside any list the
        label '*' with counter 0 is used.

        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            entry = self.list_stack[-1]
            label = entry[1]
            counter = entry[2] + 1
            entry[2] = counter
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Open an ordered list.

        The label format defaults to '1.'.  A TYPE attribute replaces it;
        a one-character value gets a '.' appended.

        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type':
                if len(v) == 1:
                    v = v + '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """Close the innermost open list and restore the margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        """Open a definition list."""
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        """Close a definition list, including a still-open <DD> entry."""
        self.ddpop(1)
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        """Start a definition term; closes a pending <DD> first."""
        self.ddpop()

    def do_dd(self, attrs):
        """Start a definition body, indented by a 'dd' margin."""
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and close the innermost <DD> entry, if any.

        bl is passed to the formatter's end_paragraph().  The stack entry
        and its margin are removed only when the top of list_stack is a
        'dd' entry.

        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Extract HREF, NAME and TYPE from an <A> tag and call anchor_bgn().

        Attribute values are stripped of surrounding whitespace; TYPE is
        additionally lower-cased.  Missing attributes are passed as ''.

        """
        href = ''
        name = ''
        type_ = ''
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                type_ = value.lower()
        self.anchor_bgn(href, name, type_)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collect the attributes of an <IMG> tag and call handle_image().

        Defaults: alt is '(image)', align/ismap/src are '', width and
        height are 0.  A WIDTH or HEIGHT value that is not an integer is
        ignored, leaving the default in place.

        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                try:
                    width = int(value)
                except ValueError:
                    pass
            elif attrname == 'height':
                try:
                    height = int(value)
                except ValueError:
                    pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        """Treat the rest of the document as preformatted text."""
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
 | |
| 
 | |
| 
 | |
def test(args = None):
    """Command-line self-test: parse one HTML file and render it as text.

    args -- list of command-line words; when empty or None, sys.argv[1:]
            is used.  An optional leading '-s' selects silent mode, in
            which the output goes to a formatter.NullFormatter instead of
            an AbstractFormatter wrapping a DumbWriter.  The next word
            names the file to parse: '-' means standard input and the
            default is 'test.html'.  The list passed in is not modified.

    If the file cannot be opened, "<file> : <error>" is printed and the
    process exits with status 1.
    """
    import sys, formatter

    if not args:
        args = sys.argv[1:]
    else:
        # Work on a copy: the '-s' flag is deleted below and the caller's
        # list must not change as a side effect.
        args = list(args)

    silent = args[:1] == ['-s']
    if silent:
        del args[0]

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        data = sys.stdin.read()
    else:
        try:
            stream = open(path, 'r')
        except IOError:
            # sys.exc_info() and the single-argument print form parse the
            # same way on every Python version and emit the same text as
            # the former "print file, ':', msg".
            print("%s : %s" % (path, sys.exc_info()[1]))
            sys.exit(1)
        try:
            data = stream.read()
        finally:
            # Close the file even when read() fails.
            stream.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    parser = HTMLParser(fmt)
    parser.feed(data)
    parser.close()


if __name__ == '__main__':
    test()
 | 
