1995-03-17 16:07:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								\section{Standard Module \sectcode{urllib}}
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-16 16:29:46 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\stmodindex{urllib}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\index{WWW}
							 | 
						
					
						
							
								
									
										
										
										
											1995-03-17 16:07:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								\index{World-Wide Web}
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-27 17:51:51 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								\index{URL}
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-16 16:29:46 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								\renewcommand{\indexsubitem}{(in module urllib)}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-16 16:29:46 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								This module provides a high-level interface for fetching data across
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								the World-Wide Web.  In particular, the \code{urlopen} function is
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								similar to the built-in function \code{open}, but accepts URLs
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								(Universal Resource Locators) instead of filenames.  Some restrictions
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								apply --- it can only open URLs for reading, and no seek operations
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								are available.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								it defines the following public functions:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\begin{funcdesc}{urlopen}{url}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Open a network object denoted by a URL for reading.  If the URL does
							 | 
						
					
						
							
								
									
										
										
										
											1995-03-17 16:07:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								not have a scheme identifier, or if it has \samp{file:} as its scheme
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-16 16:29:46 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								identifier, this opens a local file; otherwise it opens a socket to a
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								server somewhere on the network.  If the connection cannot be made, or
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								if the server returns an error code, the \code{IOError} exception is
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								raised.  If all went well, a file-like object is returned.  This
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								supports the following methods: \code{read()}, \code{readline()},
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\code{readlines()}, \code{fileno()}, \code{close()} and \code{info()}.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Except for the last one, these methods have the same interface as for
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								file objects --- see the section on File Objects earlier in this
							 | 
						
					
						
							
								
									
										
										
										
											1995-03-17 16:07:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								manual.  (It's not a built-in file object, however, so it can't be
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								used at those few places where a true built-in file object is
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								required.)
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-16 16:29:46 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								The \code{info()} method returns an instance of the class
							 | 
						
					
						
							
								
									
										
										
										
											1997-06-02 17:34:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								\code{mimetools.Message} containing the headers received from the server,
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-16 16:29:46 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								if the protocol uses such headers (currently the only supported
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								protocol that uses this is HTTP).  See the description of the
							 | 
						
					
						
							
								
									
										
										
										
											1997-06-02 17:34:22 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								\code{mimetools} module.
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-16 16:29:46 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\end{funcdesc}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\begin{funcdesc}{urlretrieve}{url}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Copy a network object denoted by a URL to a local file, if necessary.
							 | 
						
					
						
							
								
									
										
										
										
											1995-03-07 10:14:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								If the URL points to a local file, or a valid cached copy of the
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-16 16:29:46 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								object exists, the object is not copied.  Return a tuple (\var{filename},
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\var{headers}) where \var{filename} is the local file name under which
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								the object can be found, and \var{headers} is either \code{None} (for
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								a local object) or whatever the \code{info()} method of the object
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								returned by \code{urlopen()} returned (for a remote object, possibly
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								cached).  Exceptions are the same as for \code{urlopen()}.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\end{funcdesc}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\begin{funcdesc}{urlcleanup}{}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Clear the cache that may have been built up by previous calls to
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\code{urlretrieve()}.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\end{funcdesc}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-27 17:51:51 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								\begin{funcdesc}{quote}{string\optional{\, addsafe}}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Replace special characters in \var{string} using the \code{\%xx} escape.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Letters, digits, and the characters ``\code{_,.-}'' are never quoted.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								The optional \var{addsafe} parameter specifies additional characters
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								that should not be quoted --- its default value is \code{'/'}.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											1996-12-13 14:48:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								Example: \code{quote('/\~connolly/')} yields \code{'/\%7econnolly/'}.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\end{funcdesc}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\begin{funcdesc}{quote_plus}{string\optional{\, addsafe}}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Like \code{quote()}, but also replaces spaces by plus signs, as
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								required for quoting HTML form values.
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-27 17:51:51 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								\end{funcdesc}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\begin{funcdesc}{unquote}{string}
							 | 
						
					
						
							
								
									
										
										
										
											1995-03-07 10:14:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								Replace \samp{\%xx} escapes by their single-character equivalent.
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-27 17:51:51 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-28 17:14:32 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								Example: \code{unquote('/\%7Econnolly/')} yields \code{'/\~connolly/'}.
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-27 17:51:51 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								\end{funcdesc}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											1996-12-13 14:48:47 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								\begin{funcdesc}{unquote_plus}{string}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Like \code{unquote()}, but also replaces plus signs by spaces, as
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								required for unquoting HTML form values.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\end{funcdesc}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-16 16:29:46 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Restrictions:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\begin{itemize}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\item
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Currently, only the following protocols are supported: HTTP, (versions
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								0.9 and 1.0), Gopher (but not Gopher-+), FTP, and local files.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\index{HTTP}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\index{Gopher}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\index{FTP}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\item
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								The caching feature of \code{urlretrieve()} has been disabled until I
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								find the time to hack proper processing of Expiration time headers.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\item
							 | 
						
					
						
							
								
									
										
										
										
											1995-03-07 10:14:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								There should be a function to query whether a particular URL is in
							 | 
						
					
						
							
								
									
										
										
										
											1995-02-16 16:29:46 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								the cache.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\item
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								For backward compatibility, if a URL appears to point to a local file
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								but the file can't be opened, the URL is re-interpreted using the FTP
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								protocol.  This can sometimes cause confusing error messages.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\item
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								The \code{urlopen()} and \code{urlretrieve()} functions can cause
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								arbitrarily long delays while waiting for a network connection to be
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								set up.  This means that it is difficult to build an interactive
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								web client using these functions without using threads.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\item
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								The data returned by \code{urlopen()} or \code{urlretrieve()} is the
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								raw data returned by the server.  This may be binary data (e.g. an
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								image), plain text or (for example) HTML.  The HTTP protocol provides
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								type information in the reply header, which can be inspected by
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								looking at the \code{Content-type} header.  For the Gopher protocol,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								type information is encoded in the URL; there is currently no easy way
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								to extract it.  If the returned data is HTML, you can use the module
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\code{htmllib} to parse it.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\index{HTML}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\index{HTTP}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\index{Gopher}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\stmodindex{htmllib}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\item
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Although the \code{urllib} module contains (undocumented) routines to
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								parse and unparse URL strings, the recommended interface for URL
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								manipulation is in module \code{urlparse}.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\stmodindex{urlparse}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\end{itemize}
							 |