\section{\module{robotparser} ---
         Parser for robots.txt}

\declaremodule{standard}{robotparser}
\modulesynopsis{Loads a \protect\file{robots.txt} file and
                answers questions about fetchability of other URLs.}
\sectionauthor{Skip Montanaro}{skip@mojam.com}

\index{WWW}
\index{World Wide Web}
\index{URL}
\index{robots.txt}

This module provides a single class, \class{RobotFileParser}, which answers
questions about whether or not a particular user agent can fetch a URL on
the Web site that published the \file{robots.txt} file.  For more details on
the structure of \file{robots.txt} files, see
\url{http://info.webcrawler.com/mak/projects/robots/norobots.html}.

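A \file{robots.txt} file consists of one or more records, each of which
names the user agents it applies to and lists the URL prefixes those agents
should not fetch.  For illustration, a minimal (hypothetical) file that
keeps all robots out of \file{/cgi-bin/} and \file{/tmp/} looks like this:

\begin{verbatim}
User-agent: *
Disallow: /cgi-bin/
Disallow: /tmp/
\end{verbatim}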
					
						
\begin{classdesc}{RobotFileParser}{}

This class provides a set of methods to read, parse and answer questions
about a single \file{robots.txt} file.

\begin{methoddesc}{set_url}{url}
Sets the URL referring to a \file{robots.txt} file.
\end{methoddesc}

\begin{methoddesc}{read}{}
Reads the \file{robots.txt} URL and feeds it to the parser.
\end{methoddesc}

\begin{methoddesc}{parse}{lines}
Parses the \var{lines} argument, a list of lines from a \file{robots.txt}
file.
\end{methoddesc}

\begin{methoddesc}{can_fetch}{useragent, url}
Returns \code{True} if the \var{useragent} is allowed to fetch the \var{url}
according to the rules contained in the parsed \file{robots.txt} file.
\end{methoddesc}

\begin{methoddesc}{mtime}{}
Returns the time the \file{robots.txt} file was last fetched.  This is
useful for long-running web spiders that need to check for new
\file{robots.txt} files periodically (see the second example below).
\end{methoddesc}

\begin{methoddesc}{modified}{}
Sets the time the \file{robots.txt} file was last fetched to the current
time.
\end{methoddesc}

\end{classdesc}

The following example demonstrates basic use of the
\class{RobotFileParser} class.

\begin{verbatim}
>>> import robotparser
>>> rp = robotparser.RobotFileParser()
>>> rp.set_url("http://www.musi-cal.com/robots.txt")
>>> rp.read()
>>> rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco")
False
>>> rp.can_fetch("*", "http://www.musi-cal.com/")
True
\end{verbatim}
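
For a long-running spider, \method{mtime()} and \method{modified()} can be
combined to decide when the cached \file{robots.txt} file should be fetched
again.  The following sketch is only illustrative: the site, the user agent
string, and the one-day refresh interval are arbitrary choices, not part of
the module.

\begin{verbatim}
import robotparser
import time

rp = robotparser.RobotFileParser()
rp.set_url("http://www.example.com/robots.txt")  # hypothetical site
rp.read()
rp.modified()        # record when the file was fetched

def allowed(url):
    # Re-read the file if the cached copy is more than a day old.
    if time.time() - rp.mtime() > 24 * 60 * 60:
        rp.read()
        rp.modified()
    return rp.can_fetch("ExampleBot", url)
\end{verbatim}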