mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 13:41:24 +00:00 
			
		
		
		
	Close issue 3437: the parser state was not updated when Allow lines were processed.
Also adds test cases that use Allow: lines.
This commit is contained in:
		
							parent
							
								
									4b99e9b479
								
							
						
					
					
						commit
						1ef19f0de1
					
				
					 2 changed files with 74 additions and 0 deletions
				
			
		|  | @ -76,6 +76,10 @@ def parse(self, lines): | |||
|         """parse the input lines from a robots.txt file. | ||||
|            A user-agent: line need not be preceded by | ||||
|            one or more blank lines.""" | ||||
|         # states: | ||||
|         #   0: start state | ||||
|         #   1: saw user-agent line | ||||
|         #   2: saw an allow or disallow line | ||||
|         state = 0 | ||||
|         linenumber = 0 | ||||
|         entry = Entry() | ||||
|  | @ -114,6 +118,7 @@ def parse(self, lines): | |||
|                 elif line[0] == "allow": | ||||
|                     if state != 0: | ||||
|                         entry.rulelines.append(RuleLine(line[1], True)) | ||||
|                         state = 2 | ||||
|         if state == 2: | ||||
|             self.entries.append(entry) | ||||
| 
 | ||||
|  |  | |||
|  | @ -134,6 +134,75 @@ def RobotTest(index, robots_txt, good_urls, bad_urls, | |||
| 
 | ||||
| RobotTest(7, doc, good, bad) | ||||
| 
 | ||||
| # From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364 | ||||
| 
 | ||||
| # 8. | ||||
| doc = """ | ||||
| User-agent: Googlebot | ||||
| Allow: /folder1/myfile.html | ||||
| Disallow: /folder1/ | ||||
| """ | ||||
| 
 | ||||
| good = ['/folder1/myfile.html'] | ||||
| bad = ['/folder1/anotherfile.html'] | ||||
| 
 | ||||
| RobotTest(8, doc, good, bad, agent="Googlebot") | ||||
| 
 | ||||
| # 9-10.  This file is incorrect because "Googlebot" is a substring of | ||||
| #     "Googlebot-Mobile", so the Googlebot entry also applies to | ||||
| #     Googlebot-Mobile and test 10 works just like test 9. | ||||
| doc = """ | ||||
| User-agent: Googlebot | ||||
| Disallow: / | ||||
| 
 | ||||
| User-agent: Googlebot-Mobile | ||||
| Allow: / | ||||
| """ | ||||
| 
 | ||||
| good = [] | ||||
| bad = ['/something.jpg'] | ||||
| 
 | ||||
| RobotTest(9, doc, good, bad, agent="Googlebot") | ||||
| 
 | ||||
| good = [] | ||||
| bad = ['/something.jpg'] | ||||
| 
 | ||||
| RobotTest(10, doc, good, bad, agent="Googlebot-Mobile") | ||||
| 
 | ||||
| # 11-12.  Get the order correct: the more specific Googlebot-Mobile entry comes first. | ||||
| doc = """ | ||||
| User-agent: Googlebot-Mobile | ||||
| Allow: / | ||||
| 
 | ||||
| User-agent: Googlebot | ||||
| Disallow: / | ||||
| """ | ||||
| 
 | ||||
| good = [] | ||||
| bad = ['/something.jpg'] | ||||
| 
 | ||||
| RobotTest(11, doc, good, bad, agent="Googlebot") | ||||
| 
 | ||||
| good = ['/something.jpg'] | ||||
| bad = [] | ||||
| 
 | ||||
| RobotTest(12, doc, good, bad, agent="Googlebot-Mobile") | ||||
| 
 | ||||
| 
 | ||||
| # 13.  Google also got the order wrong in #8.  You need to specify the | ||||
| #      URLs from more specific to more general. | ||||
| doc = """ | ||||
| User-agent: Googlebot | ||||
| Allow: /folder1/myfile.html | ||||
| Disallow: /folder1/ | ||||
| """ | ||||
| 
 | ||||
| good = ['/folder1/myfile.html'] | ||||
| bad = ['/folder1/anotherfile.html'] | ||||
| 
 | ||||
| RobotTest(13, doc, good, bad, agent="googlebot") | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| class TestCase(unittest.TestCase): | ||||
|     def runTest(self): | ||||
|         test_support.requires('network') | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Skip Montanaro
						Skip Montanaro