mirror of
https://github.com/python/cpython.git
synced 2026-03-01 10:30:55 +00:00
The original tool wasn't working right and it was simpler to create a new one, partially re-using some of the old code. At this point the tool runs properly on the master. (Try: ./python Tools/c-analyzer/c-analyzer.py analyze.) It takes ~40 seconds on my machine to analyze the full CPython code base. Note that we'll need to iron out some OS-specific stuff (e.g. preprocessor). We're okay though since this tool isn't used yet in our workflow. We will also need to verify the analysis results in detail before activating the check in CI, though I'm pretty sure it's close. https://bugs.python.org/issue36876
212 lines
7.3 KiB
Python
212 lines
7.3 KiB
Python
"""A simple non-validating parser for C99.
|
|
|
|
The functions and regex patterns here are not entirely suitable for
|
|
validating C syntax. Please rely on a proper compiler for that.
|
|
Instead our goal here is merely matching and extracting information from
|
|
valid C code.
|
|
|
|
Furthermore, the grammar rules for the C syntax (particularly as
|
|
described in the K&R book) actually describe a superset, of which the
|
|
full C language is a proper subset. Here are some of the extra
|
|
conditions that must be applied when parsing C code:
|
|
|
|
* ...
|
|
|
|
(see: http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf)
|
|
|
|
We have taken advantage of the elements of the C grammar that are used
|
|
only in a few limited contexts, mostly as delimiters. They allow us to
|
|
focus the regex patterns confidently. Here are the relevant tokens and
|
|
in which grammar rules they are used:
|
|
|
|
separators:
|
|
* ";"
|
|
+ (decl) struct/union: at end of each member decl
|
|
+ (decl) declaration: at end of each (non-compound) decl
|
|
+ (stmt) expr stmt: at end of each stmt
|
|
+ (stmt) for: between exprs in "header"
|
|
+ (stmt) goto: at end
|
|
+ (stmt) continue: at end
|
|
+ (stmt) break: at end
|
|
+ (stmt) return: at end
|
|
* ","
|
|
+ (decl) struct/union: between member declators
|
|
+ (decl) param-list: between params
|
|
+ (decl) enum: between enumerators
|
|
+ (decl) initializer (compound): between initializers
|
|
+ (expr) postfix: between func call args
|
|
+ (expr) expression: between "assignment" exprs
|
|
* ":"
|
|
+ (decl) struct/union: in member declators
|
|
+ (stmt) label: between label and stmt
|
|
+ (stmt) case: between expression and stmt
|
|
+ (stmt) default: between "default" and stmt
|
|
* "="
|
|
+ (decl) declaration: between decl and initializer
|
|
+ (decl) enumerator: between identifier and "initializer"
|
|
+ (expr) assignment: between "var" and expr
|
|
|
|
wrappers:
|
|
* "(...)"
|
|
+ (decl) declarator (func ptr): to wrap ptr/name
|
|
+ (decl) declarator (func ptr): around params
|
|
+ (decl) declarator: around sub-declarator (for readability)
|
|
+ (expr) postfix (func call): around args
|
|
+ (expr) primary: around sub-expr
|
|
+ (stmt) if: around condition
|
|
+ (stmt) switch: around source expr
|
|
+ (stmt) while: around condition
|
|
+ (stmt) do-while: around condition
|
|
+ (stmt) for: around "header"
|
|
* "{...}"
|
|
+ (decl) enum: around enumerators
|
|
+ (decl) func: around body
|
|
+ (stmt) compound: around stmts
|
|
* "[...]"
|
|
+ (decl) declarator: for arrays
|
|
+ (expr) postfix: array access
|
|
|
|
other:
|
|
* "*"
|
|
+ (decl) declarator: for pointer types
|
|
+ (expr) unary: for pointer deref
|
|
|
|
|
|
To simplify the regular expressions used here, we've taken some
|
|
shortcuts and made certain assumptions about the code we are parsing.
|
|
Some of these allow us to skip context-sensitive matching (e.g. braces)
|
|
or otherwise still match arbitrary C code unambiguously. However, in
|
|
some cases there are certain corner cases where the patterns are
|
|
ambiguous relative to arbitrary C code. However, they are still
|
|
unambiguous in the specific code we are parsing.
|
|
|
|
Here are the cases where we've taken shortcuts or made assumptions:
|
|
|
|
* there is no overlap syntactically between the local context (func
|
|
bodies) and the global context (other than variable decls), so we
|
|
do not need to worry about ambiguity due to the overlap:
|
|
+ the global context has no expressions or statements
|
|
+ the local context has no function definitions or type decls
|
|
* no "inline" type declarations (struct, union, enum) in function
|
|
parameters ~(including function pointers)~
|
|
* no "inline" type decls in function return types
|
|
* no superfluous parentheses in declarators
|
|
* var decls in for loops are always "simple" (e.g. no inline types)
|
|
* only inline struct/union/enum decls may be anonymous (without a name)
|
|
* no function pointers in function pointer parameters
|
|
* for loop "headers" do not have curly braces (e.g. compound init)
|
|
* syntactically, variable decls do not overlap with stmts/exprs, except
|
|
in the following case:
|
|
spam (*eggs) (...)
|
|
This could be either a function pointer variable named "eggs"
|
|
or a call to a function named "spam", which returns a function
|
|
pointer that gets called. The only differentiator is the
|
|
syntax used in the "..." part. It will be comma-separated
|
|
parameters for the former and comma-separated expressions for
|
|
the latter. Thus, if we expect such decls or calls then we must
|
|
parse the decl params.
|
|
"""
|
|
|
|
"""
|
|
TODO:
|
|
* extract CPython-specific code
|
|
* drop include injection (or only add when needed)
|
|
* track position instead of slicing "text"
|
|
* Parser class instead of the _iter_source() mess
|
|
* alt impl using a state machine (& tokenizer or split on delimiters)
|
|
"""
|
|
|
|
from ..info import ParsedItem
|
|
from ._info import SourceInfo
|
|
|
|
|
|
def parse(srclines):
    """Yield a ParsedItem for each raw result parsed out of *srclines*.

    "srclines" is handed to _parse() unchanged, together with a fresh
    name factory from anonymous_names().  Passing a filename (a str) is
    not supported yet.

    Note that this is a generator, so the NotImplementedError for a
    filename is raised on the first iteration, not at call time.
    """
    if isinstance(srclines, str):  # a filename
        raise NotImplementedError

    namer = anonymous_names()
    raw_items = _parse(srclines, namer)
    for raw in raw_items:
        yield ParsedItem.from_raw(raw)
|
|
|
|
|
|
# XXX Later: Add a separate function to deal with preprocessor directives
|
|
# parsed out of raw source.
|
|
|
|
|
|
def anonymous_names():
    """Return a function that generates sequentially numbered names.

    Each call to the returned function produces "<prefix><N>", where N
    starts at 1 and goes up by one per call.  The numbering is shared
    across prefixes and is independent for each anonymous_names() call.
    """
    def _numbers():
        current = 1
        while True:
            yield current
            current += 1

    numbers = _numbers()

    def anon_name(prefix='anon-'):
        return f'{prefix}{next(numbers)}'

    return anon_name
|
|
|
|
|
|
#############################
|
|
# internal impl
|
|
|
|
import logging
|
|
|
|
|
|
_logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _parse(srclines, anon_name):
    """Yield each raw result that parse_globals() produces for *srclines*.

    The lines are first wrapped by _iter_source(); "anon_name" is passed
    through to parse_globals() untouched.
    """
    # Function-local import (presumably to avoid an import cycle -- confirm).
    from ._global import parse_globals

    # When debugging, pass showtext=True to _iter_source() here.
    chunks = _iter_source(srclines)
    for item in parse_globals(chunks, anon_name):
        # XXX Handle blocks here instead of in parse_globals().
        yield item
|
|
|
|
|
|
def _drain_source(srcinfo, showtext):
    """Yield *srcinfo* repeatedly for as long as srcinfo._used() is true."""
    while srcinfo._used():
        yield srcinfo
        if showtext:
            _logger.debug('=> %s', srcinfo.text)


def _iter_source(lines, *, maxtext=20_000, maxlines=700, showtext=False):
    """Feed preprocessed lines into per-file SourceInfo objects, yielding them.

    "lines" should be an iterable of (fileinfo, line) pairs, as produced
    by the preprocessor code.  Only fileinfo.filename and fileinfo.lno
    are read here.

    One SourceInfo is kept per file in a stack.  When a line belongs to
    a file that is already on the stack, every file above it is dropped.
    After each line is added, the SourceInfo is yielded for as long as
    its _used() check reports true.

    "maxtext" caps len(srcinfo.text) and "maxlines" caps
    (srcinfo.end - srcinfo.start).  Either may be None to disable that
    limit.  If "showtext" is true then the accumulated text is logged
    (at debug level) after each yield.

    Raises Exception ("unmatched text ...") if a limit is exceeded, or
    if the input is exhausted while the last SourceInfo is still
    flagged _ready (presumably meaning unparsed text remains -- confirm
    against SourceInfo).
    """
    filestack = []
    allinfo = {}
    for fileinfo, line in lines:
        filename = fileinfo.filename
        if filename in filestack:
            # Back in a file we were already reading; discard everything
            # that was stacked on top of it.
            while filename != filestack[-1]:
                del allinfo[filestack.pop()]
            srcinfo = allinfo[filename]
        else:
            srcinfo = SourceInfo(filename)
            filestack.append(filename)
            allinfo[filename] = srcinfo

        # Lazy %-style args: nothing is formatted unless debug is enabled.
        _logger.debug('-> %s', line)
        srcinfo._add_line(line, fileinfo.lno)
        if maxtext is not None and len(srcinfo.text) > maxtext:
            break
        if maxlines is not None and srcinfo.end - srcinfo.start > maxlines:
            break
        yield from _drain_source(srcinfo, showtext)
    else:
        # The input ran out without hitting a limit.
        if not filestack:
            # There were no lines at all.
            srcinfo = SourceInfo('???')
        else:
            srcinfo = allinfo[filestack[-1]]
            yield from _drain_source(srcinfo, showtext)
        # Offer the final SourceInfo one more time.
        yield srcinfo
        if showtext:
            _logger.debug('=> %s', srcinfo.text)
        if not srcinfo._ready:
            return

    # At this point either the file ended prematurely
    # or there's "too much" text.
    filename, lno, text = srcinfo.filename, srcinfo._start, srcinfo.text
    if len(text) > 500:
        text = text[:500] + '...'
    raise Exception(f'unmatched text ({filename} starting at line {lno}):\n{text}')
|