mirror of
				https://github.com/python/cpython.git
				synced 2025-10-30 05:01:30 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			142 lines
		
	
	
	
		
			5.5 KiB
		
	
	
	
		
			TeX
		
	
	
	
	
	
			
		
		
	
	
			142 lines
		
	
	
	
		
			5.5 KiB
		
	
	
	
		
			TeX
		
	
	
	
	
	
| \section{\module{HTMLParser} ---
 | |
|          Simple HTML and XHTML parser}
 | |
| 
 | |
| \declaremodule{standard}{HTMLParser}
 | |
| \modulesynopsis{A simple parser that can handle HTML and XHTML.}
 | |
| 
 | |
| This module defines a class \class{HTMLParser} which serves as the
 | |
| basis for parsing text files formatted in HTML\index{HTML} (HyperText
 | |
| Mark-up Language) and XHTML.\index{XHTML}  Unlike the parser in
 | |
| \refmodule{htmllib}, this parser is not based on the SGML parser in
 | |
| \refmodule{sgmllib}.
 | |
| 
 | |
| 
 | |
| \begin{classdesc}{HTMLParser}{}
 | |
| The \class{HTMLParser} class is instantiated without arguments.
 | |
| 
 | |
| An \class{HTMLParser} instance is fed HTML data and calls handler functions
 | |
| when tags begin and end.  The \class{HTMLParser} class is meant to be
 | |
| overridden by the user to provide a desired behavior.
 | |
| 
 | |
| Unlike the parser in \refmodule{htmllib}, this parser does not check
 | |
| that end tags match start tags or call the end-tag handler for
 | |
| elements which are closed implicitly by closing an outer element.
 | |
| \end{classdesc}
 | |
| 
 | |
| 
 | |
| \class{HTMLParser} instances have the following methods:
 | |
| 
 | |
| \begin{methoddesc}{reset}{}
 | |
| Reset the instance.  Loses all unprocessed data.  This is called
 | |
| implicitly at instantiation time.
 | |
| \end{methoddesc}
 | |
| 
 | |
| \begin{methoddesc}{feed}{data}
 | |
| Feed some text to the parser.  It is processed insofar as it consists
 | |
| of complete elements; incomplete data is buffered until more data is
 | |
| fed or \method{close()} is called.
 | |
| \end{methoddesc}
 | |
| 
 | |
| \begin{methoddesc}{close}{}
 | |
| Force processing of all buffered data as if it were followed by an
 | |
| end-of-file mark.  This method may be redefined by a derived class to
 | |
| define additional processing at the end of the input, but the
 | |
| redefined version should always call the \class{HTMLParser} base class
 | |
| method \method{close()}.
 | |
| \end{methoddesc}
 | |
| 
 | |
| \begin{methoddesc}{getpos}{}
 | |
| Return current line number and offset.
 | |
| \end{methoddesc}
 | |
| 
 | |
| \begin{methoddesc}{get_starttag_text}{}
 | |
| Return the text of the most recently opened start tag.  This should
 | |
| not normally be needed for structured processing, but may be useful in
 | |
| dealing with HTML ``as deployed'' or for re-generating input with
 | |
| minimal changes (whitespace between attributes can be preserved,
 | |
| etc.).
 | |
| \end{methoddesc}
 | |
| 
 | |
| \begin{methoddesc}{handle_starttag}{tag, attrs} 
 | |
| This method is called to handle the start of a tag.  It is intended to
 | |
| be overridden by a derived class; the base class implementation does
 | |
| nothing.  
 | |
| 
 | |
| The \var{tag} argument is the name of the tag converted to
 | |
| lower case.  The \var{attrs} argument is a list of \code{(\var{name},
 | |
| \var{value})} pairs containing the attributes found inside the tag's
 | |
| \code{<>} brackets.  The \var{name} will be translated to lower case
 | |
| and double quotes and backslashes in the \var{value} have been
 | |
| interpreted.  For instance, for the tag \code{<A
 | |
| HREF="http://www.cwi.nl/">}, this method would be called as
 | |
| \samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}.
 | |
| \end{methoddesc}
 | |
| 
 | |
| \begin{methoddesc}{handle_startendtag}{tag, attrs}
 | |
| Similar to \method{handle_starttag()}, but called when the parser
 | |
| encounters an XHTML-style empty tag (\code{<a .../>}).  This method
 | |
| may be overridden by subclasses which require this particular lexical
 | |
| information; the default implementation simply calls
 | |
| \method{handle_starttag()} and \method{handle_endtag()}.
 | |
| \end{methoddesc}
 | |
| 
 | |
| \begin{methoddesc}{handle_endtag}{tag}
 | |
| This method is called to handle the end tag of an element.  It is
 | |
| intended to be overridden by a derived class; the base class
 | |
| implementation does nothing.  The \var{tag} argument is the name of
 | |
| the tag converted to lower case.
 | |
| \end{methoddesc}
 | |
| 
 | |
| \begin{methoddesc}{handle_data}{data}
 | |
| This method is called to process arbitrary data.  It is intended to be
 | |
| overridden by a derived class; the base class implementation does
 | |
| nothing.
 | |
| \end{methoddesc}
 | |
| 
 | |
| \begin{methoddesc}{handle_charref}{name} This method is called to
 | |
| process a character reference of the form \samp{\&\#\var{name};}.  It
 | |
| is intended to be overridden by a derived class; the base class
 | |
| implementation does nothing.  
 | |
| \end{methoddesc}
 | |
| 
 | |
| \begin{methoddesc}{handle_entityref}{name} 
 | |
| This method is called to process a general entity reference of the
 | |
| form \samp{\&\var{name};} where \var{name} is a general entity
 | |
| reference.  It is intended to be overridden by a derived class; the
 | |
| base class implementation does nothing.
 | |
| \end{methoddesc}
 | |
| 
 | |
| \begin{methoddesc}{handle_comment}{data}
 | |
| This method is called when a comment is encountered.  The
 | |
| \var{data} argument is a string containing the text between the
 | |
| \samp{<!--} and \samp{-->} delimiters, but not the delimiters
 | |
| themselves.  For example, the comment \samp{<!--text-->} will cause
 | |
| this method to be called with the argument \code{'text'}.  It is
 | |
| intended to be overridden by a derived class; the base class
 | |
| implementation does nothing.
 | |
| \end{methoddesc}
 | |
| 
 | |
| \begin{methoddesc}{handle_decl}{decl}
 | |
| Method called when an SGML declaration is read by the parser.  The
 | |
| \var{decl} parameter will be the entire contents of the declaration
 | |
| inside the \code{<!}...\code{>} markup.  It is intended to be overridden
 | |
| by a derived class; the base class implementation does nothing.
 | |
| \end{methoddesc}
 | |
| 
 | |
| 
 | |
| \subsection{Example HTML Parser \label{htmlparser-example}}
 | |
| 
 | |
| As a basic example, below is a very basic HTML parser that uses the
 | |
| \class{HTMLParser} class to print out tags as they are encountered:
 | |
| 
 | |
| \begin{verbatim}
 | |
| from HTMLParser import HTMLParser
 | |
| 
 | |
| class MyHTMLParser(HTMLParser):
 | |
| 
 | |
|     def handle_starttag(self, tag, attrs):
 | |
|         print "Encountered the beginning of a %s tag" % tag
 | |
| 
 | |
|     def handle_endtag(self, tag):
 | |
|         print "Encountered the end of a %s tag" % tag
 | |
| \end{verbatim}
 | 
