|   | """
 | ||
|  | Try to detect suspicious constructs, resembling markup | ||
|  | that has leaked into the final output. | ||
|  | 
 | ||
|  | Suspicious lines are reported in a comma-separated-file, | ||
|  | ``suspicious.csv``, located in the output directory. | ||
|  | 
 | ||
|  | The file is utf-8 encoded, and each line contains four fields: | ||
|  | 
 | ||
|  |  * document name (normalized) | ||
|  |  * line number in the source document | ||
|  |  * problematic text | ||
|  |  * complete line showing the problematic text in context | ||
|  | 
 | ||
|  | It is common to find many false positives. To avoid reporting them | ||
|  | again and again, they may be added to the ``ignored.csv`` file | ||
|  | (located in the configuration directory). The file has the same | ||
|  | format as ``suspicious.csv`` with a few differences: | ||
|  | 
 | ||
|  |   - each line defines a rule; if the rule matches, the issue | ||
|  |     is ignored. | ||
|  |   - line number may be empty (that is, nothing between the | ||
|  |     commas: ",,"). In this case, line numbers are ignored (the | ||
|  |     rule matches anywhere in the file). | ||
|  |   - the last field does not have to be a complete line; some | ||
|  |     surrounding text (never more than a line) is enough for | ||
|  |     context. | ||
|  | 
 | ||
|  | Rules are processed sequentially. A rule matches when: | ||
|  | 
 | ||
|  |  * document names are the same | ||
|  |  * problematic texts are the same | ||
|  |  * line numbers are close to each other (5 lines up or down) | ||
|  |  * the rule text is completely contained into the source line | ||
|  | 
 | ||
|  | The simplest way to create the ignored.csv file is by copying | ||
|  | undesired entries from suspicious.csv (possibly trimming the last | ||
|  | field.) | ||
|  | 
 | ||
|  | Copyright 2009 Gabriel A. Genellina | ||
|  | 
 | ||
|  | """
 | ||
|  | 
 | ||
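# For illustration only (an assumption, not part of the original file): an
# ignore rule uses the same four-field CSV layout as suspicious.csv, so an
# entry silencing a ":keyword" fragment reported around line 42 of a
# hypothetical document "library/functions" could look like:
#
#   library/functions,42,:keyword,the :keyword: role is used literally here
#
# Leaving the second field empty (",,") would make the rule match anywhere in
# that document.
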
import os, sys
import csv
import re
from docutils import nodes
from sphinx.builders import Builder

detect_all = re.compile(ur'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
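
# A small illustration (not in the original module; the sample string is made
# up) of the kind of fragments the pattern above flags:
#
#   >>> [m.group() for m in detect_all(u"See :func:`len` here")]
#   [u':func', u'`', u'`']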

class Rule:
    def __init__(self, docname, lineno, issue, line):
        "A rule for ignoring issues"
        self.docname = docname # document to which this rule applies
        self.lineno = lineno   # line number in the original source;
                               # this rule matches only near that.
                               # None -> don't care
        self.issue = issue     # the markup fragment that triggered this rule
        self.line = line       # text of the container element (single line only)


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output
    """
    name = 'suspicious'

    def init(self):
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), 'susp-ignored.csv'))

    def get_outdated_docs(self):
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        return ''

    def prepare_writing(self, docnames):
        ### PYTHON PROJECT SPECIFIC ###
        for name in set(docnames):
            if name.split('/', 1)[0] == 'documenting':
                docnames.remove(name)
        ### PYTHON PROJECT SPECIFIC ###

    def write_doc(self, docname, doctree):
        self.any_issue = False # set when any issue is encountered in this document
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        return

    def check_issue(self, line, lineno, issue):
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname: continue
            if rule.issue != issue: continue
            # Both lines must match *exactly*. This is rather strict,
            # and probably should be improved.
            # Doing fuzzy matches with Levenshtein distance could work,
            # but that means bringing in other libraries...
            # Ok, relax that requirement: just check if the rule fragment
            # is contained in the document line
            if rule.line not in line: continue
            # Check both line numbers. If they're "near",
            # this rule matches. (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5: continue
            # if it came this far, the rule matched
            return True
        return False

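    # Worked illustration (an assumption, not part of the original file): with a
    # rule Rule('library/functions', 120, u':func', u'see :func:`repr`'), a line
    # containing "see :func:`repr`" reported at lineno 123 of library/functions
    # is ignored, because the issue text matches, the rule fragment is contained
    # in the reported line, and abs(120 - 123) <= 5.
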
    def report_issue(self, text, lineno, issue):
        if not self.any_issue: self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        self.warn('[%s:%d] "%s" found in "%-.120s"' % (
                self.docname.encode(sys.getdefaultencoding(),'replace'),
                lineno,
                issue.encode(sys.getdefaultencoding(),'replace'),
                text.strip().encode(sys.getdefaultencoding(),'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        f = open(self.log_file_name, 'ab')
        writer = csv.writer(f)
        writer.writerow([self.docname.encode('utf-8'),
                lineno,
                issue.encode('utf-8'),
                text.strip().encode('utf-8')])
        del writer
        f.close()

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A CSV file with exactly the same format as suspicious.csv.
        Fields: document name (normalized), line number, issue, surrounding text
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try: f = open(filename, 'rb')
        except IOError: return
        for i, row in enumerate(csv.reader(f)):
            if len(row) != 4:
                raise ValueError, "wrong format in %s, line %d: %s" % (filename, i+1, row)
            docname, lineno, issue, text = row
            docname = docname.decode('utf-8')
            if lineno: lineno = int(lineno)
            else: lineno = None
            issue = issue.decode('utf-8')
            text = text.decode('utf-8')
            rule = Rule(docname, lineno, issue, text)
            rules.append(rule)
        f.close()
        self.info('done, %d rules loaded' % len(self.rules))


def get_lineno(node):
    "Obtain line number information for a node"
    lineno = None
    while lineno is None and node:
        node = node.parent
        lineno = node.line
    return lineno


def extract_line(text, index):
    """text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 6, 10):
    ...   print extract_line("abc\ndefgh\ni", i)
    abc
    abc
    abc
    defgh
    defgh
    i
    """
    p = text.rfind('\n', 0, index) + 1
    q = text.find('\n', index)
    if q < 0: q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):

    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set() # don't report the same issue more than once per line
            for match in detect_all(text):
                #import pdb; pdb.set_trace()
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode
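
# A minimal sketch (an assumption, not part of the original file) of how this
# builder could be registered if the module were loaded as a regular Sphinx
# extension; the CPython docs build selects it through its own configuration
# instead.
#
#     def setup(app):
#         app.add_builder(CheckSuspiciousMarkupBuilder)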