"""Tests for the tokenize module.

The tests were originally written in the old Python style, where the
test output was compared to a golden file.  This docstring represents
the first steps towards rewriting the entire test as a doctest.

The tests can be really simple.  Given a small fragment of source
code, print out a table with the tokens.  The ENDMARKER is omitted for
brevity.

>>> dump_tokens("1 + 1")
NUMBER      '1'           (1, 0) (1, 1)
OP          '+'           (1, 2) (1, 3)
NUMBER      '1'           (1, 4) (1, 5)

A comment generates a token here, unlike in the parser module.  The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.

>>> dump_tokens("if False:\\n"
...             "    # NL\\n"
...             "    True = False # NEWLINE\\n")
NAME        'if'          (1, 0) (1, 2)
NAME        'False'       (1, 3) (1, 8)
OP          ':'           (1, 8) (1, 9)
NEWLINE     '\\n'          (1, 9) (1, 10)
COMMENT     '# NL'        (2, 4) (2, 8)
NL          '\\n'          (2, 8) (2, 9)
INDENT      '    '        (3, 0) (3, 4)
NAME        'True'        (3, 4) (3, 8)
OP          '='           (3, 9) (3, 10)
NAME        'False'       (3, 11) (3, 16)
COMMENT     '# NEWLINE'   (3, 17) (3, 26)
NEWLINE     '\\n'          (3, 26) (3, 27)
DEDENT      ''            (4, 0) (4, 0)

There will be a bunch more tests of specific source patterns.

The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens.

There are some standard formatting practices that are easy to get right.

>>> roundtrip("if x == 1:\\n"
...           "    print x\\n")
if x == 1:
    print x

Some people use different formatting conventions, which makes
untokenize a little trickier.  Note that this test involves trailing
whitespace after the colon.  We use hex escapes to make the two
trailing blanks apparent in the expected output.

>>> roundtrip("if   x  ==  1  :  \\n"
...           "  print x\\n")
if   x  ==  1  :\x20\x20
  print x

Comments need to go in the right place.

>>> roundtrip("if x == 1:\\n"
...           "    # A comment by itself.\\n"
...           "    print x  # Comment here, too.\\n"
...           "    # Another comment.\\n"
...           "after_if = True\\n")
if x == 1:
    # A comment by itself.
    print x  # Comment here, too.
    # Another comment.
after_if = True

>>> roundtrip("if (x  # The comments need to go in the right place\\n"
...           "    == 1):\\n"
...           "    print 'x == 1'\\n")
if (x  # The comments need to go in the right place
    == 1):
    print 'x == 1'

"""

import os, glob, random, time, sys
from cStringIO import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
                               TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
                      ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)

# How much time in seconds can pass before we print a 'Still working' message.
_PRINT_WORKING_MSG_INTERVAL = 5 * 60

# Test the roundtrip for `untokenize`.  `f` is a file path.  The source code
# in f is tokenized, converted back to source code via tokenize.untokenize(),
# and tokenized again from the latter.  The test fails if the second
# tokenization doesn't match the first.
def test_roundtrip(f):
    ## print 'Testing:', f
    fobj = open(f)
    try:
        fulltok = list(generate_tokens(fobj.readline))
    finally:
        fobj.close()

    # Compare only (type, string) pairs: regenerating source from 2-tuples
    # normalizes whitespace, so exact token positions aren't preserved.
    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
    readline = iter(newtext.splitlines(1)).next
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for type, token, start, end, line in generate_tokens(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print "%(type)-10.10s  %(token)-13.13r %(start)s %(end)s" % locals()

def roundtrip(s):
    f = StringIO(s)
    # Feeding full 5-tuples (with start/end positions) to untokenize()
    # lets it reproduce the original source exactly, whitespace included.
    source = untokenize(generate_tokens(f.readline))
    print source,
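
# A minimal illustrative sketch (our addition, not exercised by test_main)
# contrasting the two untokenize() modes used in this module:
# test_roundtrip() feeds (type, string) 2-tuples, so only the token sequence
# is preserved and whitespace is regenerated, while roundtrip() feeds full
# 5-tuples, whose start/end positions let untokenize() rebuild the text
# exactly.
def _untokenize_modes_demo(s="x  =  1\n"):
    exact = untokenize(generate_tokens(StringIO(s).readline))
    pairs = [tok[:2] for tok in generate_tokens(StringIO(s).readline)]
    approx = untokenize(pairs)
    # exact should equal s; approx normalizes the spacing but still
    # tokenizes to the same (type, string) pairs.
    return exact, approx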

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print +21.3e-5*-.1234/81.7'
    >>> decistmt(s)
    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """

    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)
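
# An illustrative, hedged check (our addition, not invoked by test_main):
# the source text produced by decistmt() is built from a valid token
# sequence, so it should itself survive the 2-tuple untokenize() roundtrip
# that test_roundtrip() performs on whole files.
def _decistmt_roundtrips(s="print +21.3e-5*-.1234/81.7"):
    text = decistmt(s)
    t1 = [tok[:2] for tok in generate_tokens(StringIO(text).readline)]
    t2 = [tok[:2] for tok in
          generate_tokens(StringIO(untokenize(t1)).readline)]
    return t1 == t2   # expected to be True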

def test_main():
    if verbose:
        print 'starting...'

    next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    f = open(findfile('tokenize_tests' + os.extsep + 'txt'))
    tokenize(f.readline)
    f.close()

    # Now run test_roundtrip() over tokenize_tests.txt too, and over all
    # (if the "compiler" resource is enabled) or a small random sample
    # (if "compiler" is not enabled) of the test*.py files.
    f = findfile('tokenize_tests' + os.extsep + 'txt')
    test_roundtrip(f)

    testdir = os.path.dirname(f) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    if not is_resource_enabled('compiler'):
        testfiles = random.sample(testfiles, 10)

    for f in testfiles:
        # Print a still-working message since this test can be really slow.
        if next_time <= time.time():
            next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL
            print >>sys.__stdout__, '  test_main still working, be patient...'
            sys.__stdout__.flush()

        test_roundtrip(f)

    # Test detection of IndentationError.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print 'finished'

if __name__ == "__main__":
    test_main()