mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 21:51:50 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			221 lines
		
	
	
	
		
			9.4 KiB
		
	
	
	
		
			TeX
		
	
	
	
	
	
			
		
		
	
	
			221 lines
		
	
	
	
		
			9.4 KiB
		
	
	
	
		
			TeX
		
	
	
	
	
	
| \section{Standard Module \sectcode{xmllib}}
 | |
| % Author: Sjoerd Mullender
 | |
| \label{module-xmllib}
 | |
| \stmodindex{xmllib}
 | |
| \index{XML}
 | |
| 
 | |
| This module defines a class \code{XMLParser} which serves as the basis 
 | |
| for parsing text files formatted in XML (eXtensible Markup Language).
 | |
| 
 | |
| The \code{XMLParser} class must be instantiated without arguments.  It 
 | |
| has the following interface methods:
 | |
| 
 | |
| \renewcommand{\indexsubitem}{(XMLParser method)}
 | |
| 
 | |
| \begin{funcdesc}{reset}{}
 | |
| Reset the instance.  Loses all unprocessed data.  This is called
 | |
| implicitly at instantiation time.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{setnomoretags}{}
 | |
| Stop processing tags.  Treat all following input as literal input
 | |
| (CDATA).
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{setliteral}{}
 | |
| Enter literal mode (CDATA mode).
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{feed}{data}
 | |
| Feed some text to the parser.  It is processed insofar as it consists
 | |
| of complete elements; incomplete data is buffered until more data is
 | |
| fed or \code{close()} is called.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{close}{}
 | |
| Force processing of all buffered data as if it were followed by an
 | |
| end-of-file mark.  This method may be redefined by a derived class to
 | |
| define additional processing at the end of the input, but the
 | |
| redefined version should always call \code{XMLParser.close()}.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{translate_references}{data}
 | |
| Translate all entity and character references in \code{data} and
 | |
| return the translated string.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{handle_xml}{encoding\, standalone}
 | |
| This method is called when the \code{<?xml ...?>} tag is processed.
 | |
| The arguments are the values of the encoding and standalone attributes 
 | |
| in the tag.  Both encoding and standalone are optional.  The values
 | |
| passed to \code{handle_xml} default to \code{None} and the string
 | |
| \code{'no'} respectively.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{handle_doctype}{tag\, data}
 | |
| This method is called when the \code{<!DOCTYPE...>} tag is processed.
 | |
| The arguments are the name of the root element and the uninterpreted
 | |
| contents of the tag, starting after the white space after the name of
 | |
| the root element.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{handle_starttag}{tag\, method\, attributes}
 | |
| This method is called to handle start tags for which a
 | |
| \code{start_\var{tag}()} method has been defined.  The \code{tag}
 | |
| argument is the name of the tag, and the \code{method} argument is the
 | |
| bound method which should be used to support semantic interpretation
 | |
| of the start tag.  The \var{attributes} argument is a dictionary of
 | |
| attributes, the key being the \var{name} and the value being the
 | |
| \var{value} of the attribute found inside the tag's \code{<>} brackets.
 | |
| Character and entity references in the \var{value} have
 | |
| been interpreted.  For instance, for the tag
 | |
| \code{<A HREF="http://www.cwi.nl/">}, this method would be called as
 | |
| \code{handle_starttag('A', self.start_A, \{'HREF': 'http://www.cwi.nl/'\})}.
 | |
| The base implementation simply calls \code{method} with \code{attributes}
 | |
| as the only argument.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{handle_endtag}{tag\, method}
 | |
| This method is called to handle end tags for which an
 | |
| \code{end_\var{tag}()} method has been defined.  The \code{tag}
 | |
| argument is the name of the tag, and the
 | |
| \code{method} argument is the bound method which should be used to
 | |
| support semantic interpretation of the end tag.  If no
 | |
| \code{end_\var{tag}()} method is defined for the closing element, this
 | |
| handler is not called.  The base implementation simply calls
 | |
| \code{method}.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{handle_data}{data}
 | |
| This method is called to process arbitrary data.  It is intended to be
 | |
| overridden by a derived class; the base class implementation does
 | |
| nothing.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{handle_charref}{ref}
 | |
| This method is called to process a character reference of the form
 | |
| ``\code{\&\#\var{ref};}''.  \var{ref} can either be a decimal number,
 | |
| or a hexadecimal number when preceded by \code{x}.
 | |
| In the base implementation, \var{ref} must be a number in the
 | |
| range 0--255.  It translates the character to \ASCII{} and calls the
 | |
| method \code{handle_data()} with the character as argument.  If
 | |
| \var{ref} is invalid or out of range, the method
 | |
| \code{unknown_charref(\var{ref})} is called to handle the error.  A
 | |
| subclass must override this method to provide support for character
 | |
| references outside of the \ASCII{} range.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{handle_entityref}{ref}
 | |
| This method is called to process a general entity reference of the form
 | |
| ``\code{\&\var{ref};}'' where \var{ref} is a general entity
 | |
| reference.  It looks for \var{ref} in the instance (or class)
 | |
| variable \code{entitydefs} which should be a mapping from entity names
 | |
| to corresponding translations.
 | |
| If a translation is found, it calls the method \code{handle_data()}
 | |
| with the translation; otherwise, it calls the method
 | |
| \code{unknown_entityref(\var{ref})}.  The default \code{entitydefs}
 | |
| defines translations for \code{\&amp;}, \code{\&apos;}, \code{\&gt;},
 | |
| \code{\&lt;}, and \code{\&quot;}.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{handle_comment}{comment}
 | |
| This method is called when a comment is encountered.  The
 | |
| \code{comment} argument is a string containing the text between the
 | |
| ``\code{<!--}'' and ``\code{-->}'' delimiters, but not the delimiters
 | |
| themselves.  For example, the comment ``\code{<!--text-->}'' will
 | |
| cause this method to be called with the argument \code{'text'}.  The
 | |
| default method does nothing.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{handle_cdata}{data}
 | |
| This method is called when a CDATA element is encountered.  The
 | |
| \code{data} argument is a string containing the text between the
 | |
| ``\code{<![CDATA[}'' and ``\code{]]>}'' delimiters, but not the delimiters
 | |
| themselves.  For example, the entity ``\code{<![CDATA[text]]>}'' will
 | |
| cause this method to be called with the argument \code{'text'}.  The
 | |
| default method does nothing.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{handle_proc}{name\, data}
 | |
| This method is called when a processing instruction (PI) is encountered.  The
 | |
| \code{name} is the PI target, and the \code{data} argument is a
 | |
| string containing the text between the PI target and the closing delimiter,
 | |
| but not the delimiter itself.  For example, the instruction
 | |
| ``\code{<?XML text?>}'' will cause this method to be called with the
 | |
| arguments \code{'XML'} and \code{'text'}.  The default method does
 | |
| nothing.  Note that if a document starts with a \code{<?xml ...?>}
 | |
| tag, \code{handle_xml} is called to handle it.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{handle_special}{data}
 | |
| This method is called when a declaration is encountered.  The
 | |
| \code{data} argument is a string containing the text between the
 | |
| ``\code{<!}'' and ``\code{>}'' delimiters, but not the delimiters
 | |
| themselves.  For example, the entity ``\code{<!ENTITY text>}'' will
 | |
| cause this method to be called with the argument \code{'ENTITY text'}.  The
 | |
| default method does nothing.  Note that \code{<!DOCTYPE ...>} is
 | |
| handled separately if it is located at the start of the document.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{syntax_error}{message}
 | |
| This method is called when a syntax error is encountered.  The
 | |
| \code{message} is a description of what was wrong.  The default method 
 | |
| raises a \code{RuntimeError} exception.  If this method is overridden, 
 | |
| it is permissible for it to return.  This method is only called when
 | |
| the error can be recovered from.  Unrecoverable errors raise a
 | |
| \code{RuntimeError} without first calling \code{syntax_error}.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{unknown_starttag}{tag\, attributes}
 | |
| This method is called to process an unknown start tag.  It is intended
 | |
| to be overridden by a derived class; the base class implementation
 | |
| does nothing.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{unknown_endtag}{tag}
 | |
| This method is called to process an unknown end tag.  It is intended
 | |
| to be overridden by a derived class; the base class implementation
 | |
| does nothing.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{unknown_charref}{ref}
 | |
| This method is called to process unresolvable numeric character
 | |
| references.  It is intended to be overridden by a derived class; the
 | |
| base class implementation does nothing.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{unknown_entityref}{ref}
 | |
| This method is called to process an unknown entity reference.  It is
 | |
| intended to be overridden by a derived class; the base class
 | |
| implementation does nothing.
 | |
| \end{funcdesc}
 | |
| 
 | |
| Apart from overriding or extending the methods listed above, derived
 | |
| classes may also define methods and variables of the following form to
 | |
| define processing of specific tags.  Tag names in the input stream are
 | |
| case-sensitive; the \var{tag} occurring in method names must be in the
 | |
| correct case:
 | |
| 
 | |
| \begin{funcdesc}{start_\var{tag}}{attributes}
 | |
| This method is called to process an opening tag \var{tag}.  The
 | |
| \var{attributes} argument has the same meaning as described for
 | |
| \code{handle_starttag()} above.  In fact, the base implementation of
 | |
| \code{handle_starttag} calls this method.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{end_\var{tag}}{}
 | |
| This method is called to process a closing tag \var{tag}.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{datadesc}{\var{tag}_attributes}
 | |
| If a class or instance variable \code{\var{tag}_attributes} exists, it 
 | |
| should be a list or a dictionary.  If a list, the elements of the list 
 | |
| are the valid attributes for the element \var{tag}; if a dictionary,
 | |
| the keys are the valid attributes for the element \var{tag}, and the
 | |
| values the default values of the attributes, or \code{None} if there
 | |
| is no default.
 | |
| In addition to the attributes that were present in the tag, the
 | |
| attribute dictionary that is passed to \code{handle_starttag} and
 | |
| \code{unknown_starttag} contains values for all attributes that have a
 | |
| default value.
 | |
| \end{datadesc}
 | 
