2000-03-31 17:51:10 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\section{\module{robotparser} --- 
							 | 
						
					
						
							
								
									
										
										
										
											2000-04-28 18:17:23 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								         Parser for robots.txt}
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-31 17:51:10 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\declaremodule{standard}{robotparser}
							 | 
						
					
						
							
								
									
										
										
										
											2001-11-06 22:14:35 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								\modulesynopsis{Loads a \protect\file{robots.txt} file and
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                answers questions about fetchability of other URLs.}
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-31 17:51:10 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\sectionauthor{Skip Montanaro}{skip@mojam.com}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\index{WWW}
							 | 
						
					
						
							
								
									
										
										
										
											2001-07-14 02:50:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								\index{World Wide Web}
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-31 17:51:10 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\index{URL}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\index{robots.txt}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								This module provides a single class, \class{RobotFileParser}, which answers
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								questions about whether or not a particular user agent can fetch a URL on
							 | 
						
					
						
							
								
									
										
										
										
											2001-07-14 02:50:55 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								the Web site that published the \file{robots.txt} file.  For more details on 
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-31 17:51:10 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								the structure of \file{robots.txt} files, see
							 | 
						
					
						
							
								
									
										
										
										
											2003-07-14 17:04:50 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								\url{http://www.robotstxt.org/wc/norobots.html}. 
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-31 17:51:10 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\begin{classdesc}{RobotFileParser}{}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								This class provides a set of methods to read, parse and answer questions
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								about a single \file{robots.txt} file.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\begin{methoddesc}{set_url}{url}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Sets the URL referring to a \file{robots.txt} file.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\end{methoddesc}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\begin{methoddesc}{read}{}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Reads the \file{robots.txt} URL and feeds it to the parser.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\end{methoddesc}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\begin{methoddesc}{parse}{lines}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Parses the lines argument.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\end{methoddesc}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\begin{methoddesc}{can_fetch}{useragent, url}
							 | 
						
					
						
							
								
									
										
										
										
											2002-04-05 02:21:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								Returns \code{True} if the \var{useragent} is allowed to fetch the \var{url}
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-31 17:51:10 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								according to the rules contained in the parsed \file{robots.txt} file.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\end{methoddesc}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\begin{methoddesc}{mtime}{}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Returns the time the \code{robots.txt} file was last fetched.  This is
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								useful for long-running web spiders that need to check for new
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\code{robots.txt} files periodically.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\end{methoddesc}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\begin{methoddesc}{modified}{}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								Sets the time the \code{robots.txt} file was last fetched to the current
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								time.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\end{methoddesc}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\end{classdesc}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								The following example demonstrates basic use of the RobotFileParser class.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\begin{verbatim}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								>>> import robotparser
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								>>> rp = robotparser.RobotFileParser()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								>>> rp.set_url("http://www.musi-cal.com/robots.txt")
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								>>> rp.read()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								>>> rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco")
							 | 
						
					
						
							
								
									
										
										
										
											2002-04-05 02:21:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								False
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-31 17:51:10 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								>>> rp.can_fetch("*", "http://www.musi-cal.com/")
							 | 
						
					
						
							
								
									
										
										
										
											2002-04-05 02:21:09 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								True
							 | 
						
					
						
							
								
									
										
										
										
											2000-03-31 17:51:10 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								\end{verbatim}
							 |