| 
									
										
										
										
											2010-10-15 15:57:45 +00:00
"""
Tests for the html module functions.
"""
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | import html | 
					
						
							|  |  |  |  | import unittest | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | class HtmlTests(unittest.TestCase): | 
					
						
							|  |  |  |  |     def test_escape(self): | 
					
						
							|  |  |  |  |         self.assertEqual( | 
					
						
							|  |  |  |  |             html.escape('\'<script>"&foo;"</script>\''), | 
					
						
							|  |  |  |  |             ''<script>"&foo;"</script>'') | 
					
						
							|  |  |  |  |         self.assertEqual( | 
					
						
							|  |  |  |  |             html.escape('\'<script>"&foo;"</script>\'', False), | 
					
						
							|  |  |  |  |             '\'<script>"&foo;"</script>\'') | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-19 20:28:45 +02:00
										 |  |  |  |     def test_unescape(self): | 
					
						
							|  |  |  |  |         numeric_formats = ['&#%d', '&#%d;', '&#x%x', '&#x%x;'] | 
					
						
							|  |  |  |  |         errmsg = 'unescape(%r) should have returned %r' | 
					
						
							|  |  |  |  |         def check(text, expected): | 
					
						
							|  |  |  |  |             self.assertEqual(html.unescape(text), expected, | 
					
						
							|  |  |  |  |                              msg=errmsg % (text, expected)) | 
					
						
							|  |  |  |  |         def check_num(num, expected): | 
					
						
							|  |  |  |  |             for format in numeric_formats: | 
					
						
							|  |  |  |  |                 text = format % num | 
					
						
							|  |  |  |  |                 self.assertEqual(html.unescape(text), expected, | 
					
						
							|  |  |  |  |                                  msg=errmsg % (text, expected)) | 
					
						
							|  |  |  |  |         # check text with no character references | 
					
						
							|  |  |  |  |         check('no character references', 'no character references') | 
					
						
							|  |  |  |  |         # check & followed by invalid chars | 
					
						
							|  |  |  |  |         check('&\n&\t& &&', '&\n&\t& &&') | 
					
						
							|  |  |  |  |         # check & followed by numbers and letters | 
					
						
							|  |  |  |  |         check('&0 &9 &a &0; &9; &a;', '&0 &9 &a &0; &9; &a;') | 
					
						
							|  |  |  |  |         # check incomplete entities at the end of the string | 
					
						
							|  |  |  |  |         for x in ['&', '&#', '&#x', '&#X', '&#y', '&#xy', '&#Xy']: | 
					
						
							|  |  |  |  |             check(x, x) | 
					
						
							|  |  |  |  |             check(x+';', x+';') | 
					
						
							|  |  |  |  |         # check several combinations of numeric character references, | 
					
						
							|  |  |  |  |         # possibly followed by different characters | 
					
						
							|  |  |  |  |         formats = ['&#%d', '&#%07d', '&#%d;', '&#%07d;', | 
					
						
							|  |  |  |  |                    '&#x%x', '&#x%06x', '&#x%x;', '&#x%06x;', | 
					
						
							|  |  |  |  |                    '&#x%X', '&#x%06X', '&#X%x;', '&#X%06x;'] | 
					
						
							|  |  |  |  |         for num, char in zip([65, 97, 34, 38, 0x2603, 0x101234], | 
					
						
							|  |  |  |  |                              ['A', 'a', '"', '&', '\u2603', '\U00101234']): | 
					
						
							|  |  |  |  |             for s in formats: | 
					
						
							|  |  |  |  |                 check(s % num, char) | 
					
						
							|  |  |  |  |                 for end in [' ', 'X']: | 
					
						
							|  |  |  |  |                     check((s+end) % num, char+end) | 
					
						
							| 
									
										
										
										
											2015-01-18 11:28:37 +02:00
										 |  |  |  |         # check invalid code points | 
					
						
							| 
									
										
										
										
											2013-11-19 20:28:45 +02:00
										 |  |  |  |         for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]: | 
					
						
							|  |  |  |  |             check_num(cp, '\uFFFD') | 
					
						
							| 
									
										
										
										
											2015-01-18 11:28:37 +02:00
										 |  |  |  |         # check more invalid code points | 
					
						
							| 
									
										
										
										
											2013-11-19 20:28:45 +02:00
										 |  |  |  |         for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]: | 
					
						
							|  |  |  |  |             check_num(cp, '') | 
					
						
							|  |  |  |  |         # check invalid numbers | 
					
						
							|  |  |  |  |         for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r\u20ac\u2022\x9d'): | 
					
						
							|  |  |  |  |             check_num(num, ch) | 
					
						
							|  |  |  |  |         # check small numbers | 
					
						
							|  |  |  |  |         check_num(0, '\uFFFD') | 
					
						
							|  |  |  |  |         check_num(9, '\t') | 
					
						
							|  |  |  |  |         # check a big number | 
					
						
							|  |  |  |  |         check_num(1000000000000000000, '\uFFFD') | 
					
						
							|  |  |  |  |         # check that multiple trailing semicolons are handled correctly | 
					
						
							|  |  |  |  |         for e in ['";', '";', '";', '";']: | 
					
						
							|  |  |  |  |             check(e, '";') | 
					
						
							|  |  |  |  |         # check that semicolons in the middle don't create problems | 
					
						
							|  |  |  |  |         for e in ['"quot;', '"quot;', '"quot;', '"quot;']: | 
					
						
							|  |  |  |  |             check(e, '"quot;') | 
					
						
							|  |  |  |  |         # check triple adjacent charrefs | 
					
						
							|  |  |  |  |         for e in ['"', '"', '"', '"']: | 
					
						
							|  |  |  |  |             check(e*3, '"""') | 
					
						
							|  |  |  |  |             check((e+';')*3, '"""') | 
					
						
							|  |  |  |  |         # check that the case is respected | 
					
						
							|  |  |  |  |         for e in ['&', '&', '&', '&']: | 
					
						
							|  |  |  |  |             check(e, '&') | 
					
						
							|  |  |  |  |         for e in ['&Amp', '&Amp;']: | 
					
						
							|  |  |  |  |             check(e, e) | 
					
						
							|  |  |  |  |         # check that non-existent named entities are returned unchanged | 
					
						
							|  |  |  |  |         check('&svadilfari;', '&svadilfari;') | 
					
						
							|  |  |  |  |         # the following examples are in the html5 specs | 
					
						
							|  |  |  |  |         check('¬it', '¬it') | 
					
						
							|  |  |  |  |         check('¬it;', '¬it;') | 
					
						
							|  |  |  |  |         check('¬in', '¬in') | 
					
						
							|  |  |  |  |         check('∉', '∉') | 
					
						
							|  |  |  |  |         # a similar example with a long name | 
					
						
							|  |  |  |  |         check('¬ReallyAnExistingNamedCharacterReference;', | 
					
						
							|  |  |  |  |               '¬ReallyAnExistingNamedCharacterReference;') | 
					
						
							|  |  |  |  |         # longest valid name | 
					
						
							|  |  |  |  |         check('∳', '∳') | 
					
						
							|  |  |  |  |         # check a charref that maps to two unicode chars | 
					
						
							|  |  |  |  |         check('∾̳', '\u223E\u0333') | 
					
						
							|  |  |  |  |         check('&acE', '&acE') | 
					
						
							|  |  |  |  |         # see #12888 | 
					
						
							|  |  |  |  |         check('{ ' * 1050, '{ ' * 1050) | 
					
						
							|  |  |  |  |         # see #15156 | 
					
						
							|  |  |  |  |         check('ÉricÉric&alphacentauriαcentauri', | 
					
						
							|  |  |  |  |               'ÉricÉric&alphacentauriαcentauri') | 
					
						
							|  |  |  |  |         check('&co;', '&co;') | 
					
						
							| 
									
										
										
										
											2010-10-15 15:57:45 +00:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | if __name__ == '__main__': | 
					
						
							| 
									
										
										
										
											2013-11-19 20:28:45 +02:00
										 |  |  |  |     unittest.main() |