| 
									
										
										
										
											1998-04-04 07:23:21 +00:00
										 |  |  | \section{Standard Module \module{sgmllib}} | 
					
						
							| 
									
										
										
										
											1998-07-23 17:59:49 +00:00
										 |  |  | \declaremodule{standard}{sgmllib} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | \modulesynopsis{Only as much of an SGML parser as needed to parse HTML.} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | \index{SGML} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | This module defines a class \class{SGMLParser} which serves as the | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | basis for parsing text files formatted in SGML (Standard Generalized | 
					
						
							|  |  |  | Mark-up Language).  In fact, it does not provide a full SGML parser | 
					
						
							| 
									
										
										
										
											1996-10-09 16:13:22 +00:00
										 |  |  | --- it only parses SGML insofar as it is used by HTML, and the module | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | only exists as a base for the \module{htmllib}\refstmodindex{htmllib} | 
					
						
							|  |  |  | module. | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | \begin{classdesc}{SGMLParser}{} | 
					
						
							|  |  |  | The \class{SGMLParser} class is instantiated without arguments. | 
					
						
							|  |  |  | The parser is hardcoded to recognize the following | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | constructs: | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | \begin{itemize} | 
					
						
							|  |  |  | \item | 
					
						
							|  |  |  | Opening and closing tags of the form | 
					
						
							| 
									
										
										
										
											1998-02-13 14:37:12 +00:00
										 |  |  | \samp{<\var{tag} \var{attr}="\var{value}" ...>} and | 
					
						
							|  |  |  | \samp{</\var{tag}>}, respectively. | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | \item | 
					
						
							| 
									
										
										
										
											1998-02-13 14:37:12 +00:00
										 |  |  | Numeric character references of the form \samp{\&\#\var{name};}. | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | \item | 
					
						
							| 
									
										
										
										
											1998-02-13 14:37:12 +00:00
										 |  |  | Entity references of the form \samp{\&\var{name};}. | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | \item | 
					
						
							| 
									
										
										
										
											1998-02-13 14:37:12 +00:00
										 |  |  | SGML comments of the form \samp{<!--\var{text}-->}.  Note that | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | spaces, tabs, and newlines are allowed between the trailing | 
					
						
							| 
									
										
										
										
											1998-02-13 14:37:12 +00:00
										 |  |  | \samp{>} and the immediately preceding \samp{--}. | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | \end{itemize} | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | \end{classdesc} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | \class{SGMLParser} instances have the following interface methods: | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1996-10-09 16:13:22 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{reset}{} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | Reset the instance.  Loses all unprocessed data.  This is called | 
					
						
							|  |  |  | implicitly at instantiation time. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{setnomoretags}{} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | Stop processing tags.  Treat all following input as literal input | 
					
						
							|  |  |  | (CDATA).  (This is only provided so the HTML tag \code{<PLAINTEXT>} | 
					
						
							|  |  |  | can be implemented.) | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{setliteral}{} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | Enter literal mode (CDATA mode). | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{feed}{data} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | Feed some text to the parser.  It is processed insofar as it consists | 
					
						
							|  |  |  | of complete elements; incomplete data is buffered until more data is | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | fed or \method{close()} is called. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{close}{} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | Force processing of all buffered data as if it were followed by an | 
					
						
							|  |  |  | end-of-file mark.  This method may be redefined by a derived class to | 
					
						
							|  |  |  | define additional processing at the end of the input, but the | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | redefined version should always call \method{SGMLParser.close()}. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{handle_starttag}{tag, method, attributes} | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | This method is called to handle start tags for which either a | 
					
						
							|  |  |  | \code{start_\var{tag}()} or \code{do_\var{tag}()} method has been | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | defined.  The \var{tag} argument is the name of the tag converted to | 
					
						
							|  |  |  | lower case, and the \var{method} argument is the bound method which | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | should be used to support semantic interpretation of the start tag. | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | The \var{attributes} argument is a list of \code{(\var{name}, \var{value})} | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | pairs containing the attributes found inside the tag's \code{<>} | 
					
						
							|  |  |  | brackets.  The \var{name} has been translated to lower case and double | 
					
						
							|  |  |  | quotes and backslashes in the \var{value} have been interpreted.  For | 
					
						
							|  |  |  | instance, for the tag \code{<A HREF="http://www.cwi.nl/">}, this | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | method would be called as \samp{handle_starttag('a', \var{method}, [('href', | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | 'http://www.cwi.nl/')])}.  The base implementation simply calls | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | \var{method} with \var{attributes} as the only argument. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{handle_endtag}{tag, method} | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | This method is called to handle end tags for which an | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | \code{end_\var{tag}()} method has been defined.  The \var{tag} | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | argument is the name of the tag converted to lower case, and the | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | \var{method} argument is the bound method which should be used to | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | support semantic interpretation of the end tag.  If no | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | \code{end_\var{tag}()} method is defined for the closing element, | 
					
						
							|  |  |  | this handler is not called.  The base implementation simply calls | 
					
						
							|  |  |  | \var{method}. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{handle_data}{data} | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | This method is called to process arbitrary data.  It is intended to be | 
					
						
							|  |  |  | overridden by a derived class; the base class implementation does | 
					
						
							|  |  |  | nothing. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{handle_charref}{ref} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | This method is called to process a character reference of the form | 
					
						
							| 
									
										
										
										
											1998-02-13 14:37:12 +00:00
										 |  |  | \samp{\&\#\var{ref};}.  In the base implementation, \var{ref} must | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | be a decimal number in the | 
					
						
							| 
									
										
										
										
											1995-03-17 16:07:09 +00:00
										 |  |  | range 0-255.  It translates the character to \ASCII{} and calls the | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | method \method{handle_data()} with the character as argument.  If | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | \var{ref} is invalid or out of range, the method | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | \code{unknown_charref(\var{ref})} is called to handle the error.  A | 
					
						
							|  |  |  | subclass must override this method to provide support for named | 
					
						
							|  |  |  | character entities. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{handle_entityref}{ref} | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | This method is called to process a general entity reference of the | 
					
						
							|  |  |  | form \samp{\&\var{ref};} where \var{ref} is a general entity | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | reference.  It looks for \var{ref} in the instance (or class) | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | variable \member{entitydefs} which should be a mapping from entity | 
					
						
							|  |  |  | names to corresponding translations. | 
					
						
							|  |  |  | If a translation is found, it calls the method \method{handle_data()} | 
					
						
							| 
									
										
										
										
											1995-03-07 10:14:09 +00:00
										 |  |  | with the translation; otherwise, it calls the method | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | \code{unknown_entityref(\var{ref})}.  The default \member{entitydefs} | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | defines translations for \code{\&amp;}, \code{\&apos;}, \code{\&gt;}, | 
					
						
							|  |  |  | \code{\&lt;}, and \code{\&quot;}. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{handle_comment}{comment} | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | This method is called when a comment is encountered.  The | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | \var{comment} argument is a string containing the text between the | 
					
						
							| 
									
										
										
										
											1998-02-13 14:37:12 +00:00
										 |  |  | \samp{<!--} and \samp{-->} delimiters, but not the delimiters | 
					
						
							|  |  |  | themselves.  For example, the comment \samp{<!--text-->} will | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | cause this method to be called with the argument \code{'text'}.  The | 
					
						
							|  |  |  | default method does nothing. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{report_unbalanced}{tag} | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | This method is called when an end tag is found which does not | 
					
						
							|  |  |  | correspond to any open element. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{unknown_starttag}{tag, attributes} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | This method is called to process an unknown start tag.  It is intended | 
					
						
							|  |  |  | to be overridden by a derived class; the base class implementation | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | does nothing. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{unknown_endtag}{tag} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | This method is called to process an unknown end tag.  It is intended | 
					
						
							|  |  |  | to be overridden by a derived class; the base class implementation | 
					
						
							|  |  |  | does nothing. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{unknown_charref}{ref} | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | This method is called to process unresolvable numeric character | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | references.  Refer to \method{handle_charref()} to determine what is | 
					
						
							|  |  |  | handled by default.  It is intended to be overridden by a derived | 
					
						
							|  |  |  | class; the base class implementation does nothing. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddesc}{unknown_entityref}{ref} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | This method is called to process an unknown entity reference.  It is | 
					
						
							|  |  |  | intended to be overridden by a derived class; the base class | 
					
						
							|  |  |  | implementation does nothing. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddesc} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | Apart from overriding or extending the methods listed above, derived | 
					
						
							|  |  |  | classes may also define methods of the following form to define | 
					
						
							|  |  |  | processing of specific tags.  Tag names in the input stream are case | 
					
						
							|  |  |  | independent; the \var{tag} occurring in method names must be in lower | 
					
						
							|  |  |  | case: | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddescni}{start_\var{tag}}{attributes} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | This method is called to process an opening tag \var{tag}.  It has | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | precedence over \code{do_\var{tag}()}.  The \var{attributes} | 
					
						
							|  |  |  | argument has the same meaning as described for | 
					
						
							|  |  |  | \method{handle_starttag()} above. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddescni} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddescni}{do_\var{tag}}{attributes} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | This method is called to process an opening tag \var{tag} that does | 
					
						
							|  |  |  | not come with a matching closing tag.  The \var{attributes} argument | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | has the same meaning as described for \method{handle_starttag()} above. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddescni} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \begin{methoddescni}{end_\var{tag}}{} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | This method is called to process a closing tag \var{tag}. | 
					
						
							| 
									
										
										
										
											1998-03-27 05:27:08 +00:00
										 |  |  | \end{methoddescni} | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | Note that the parser maintains a stack of open elements for which no | 
					
						
							|  |  |  | end tag has been found yet.  Only tags processed by | 
					
						
							|  |  |  | \code{start_\var{tag}()} are pushed on this stack.  Definition of an | 
					
						
							| 
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 |  |  | \code{end_\var{tag}()} method is optional for these tags.  For tags | 
					
						
							| 
									
										
										
										
											1998-03-12 14:42:23 +00:00
										 |  |  | processed by \code{do_\var{tag}()} or by \method{unknown_starttag()}, no | 
					
						
							| 
									
										
										
										
											1996-10-08 21:51:49 +00:00
										 |  |  | \code{end_\var{tag}()} method need be defined; if defined, it will not | 
					
						
							|  |  |  | be used.  If both \code{start_\var{tag}()} and \code{do_\var{tag}()} | 
					
						
							|  |  |  | methods exist for a tag, the \code{start_\var{tag}()} method takes | 
					
						
							|  |  |  | precedence. |