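"""Check that HTML IDs (link anchors) do not disappear from Sphinx docs.

Subcommands:

* ``collect``: gather the IDs found on each page of a Sphinx HTML build
  and save them as a gzipped JSON archive
  (default: <htmldir>/html-ids.json.gz).
* ``check``: compare a baseline archive against a newer one and report
  any IDs that were removed.

The exclude file for ``check`` (``-x``) lists one ``<page>: <id>`` pair
per line; blank lines and ``#`` comments are ignored. For example
(illustrative values)::

    # Anchors that were removed on purpose:
    library/example.html: removed-anchor

Requires Python 3.14+ (uses the ``compression.gzip`` module).
"""
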
import argparse
import concurrent.futures
import functools
import html.parser
import json
import re
import sys
from compression import gzip
from pathlib import Path


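# Auto-generated IDs (e.g. docutils' "id1", "id2", ... and Sphinx's
# "index-0") are not stable across builds, so they are ignored both when
# collecting and when checking.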
IGNORED_ID_RE = re.compile(
    r"""
    index-\d+
    | id\d+
    | [_a-z]+_\d+
    """,
    re.VERBOSE,
)


class IDGatherer(html.parser.HTMLParser):
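    """HTML parser that collects ``id`` attribute values into *ids*."""
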
    def __init__(self, ids):
        super().__init__()
        self.__ids = ids

    def handle_starttag(self, tag, attrs):
        for name, value in attrs:
            if name == 'id':
                if not IGNORED_ID_RE.fullmatch(value):
                    self.__ids.add(value)


def get_ids_from_file(path):
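    """Parse the HTML file at *path* and return the set of IDs found."""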
    ids = set()
    gatherer = IDGatherer(ids)
    with path.open(encoding='utf-8') as file:
        while chunk := file.read(4096):
            gatherer.feed(chunk)
    gatherer.close()  # flush any data still buffered in the parser
    return ids


def gather_ids(htmldir, *, verbose_print):
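    """Map each page of a Sphinx HTML build to a sorted list of its IDs.

    IDs that appear on every page (typically theme boilerplate) are
    filtered out, as are pages under _static/ and whatsnew/.
    """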
    if not htmldir.joinpath('objects.inv').exists():
        raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory')

    # Threads only help on free-threaded builds; with the GIL enabled,
    # use separate processes to parse in parallel.
    if sys._is_gil_enabled():
        pool = concurrent.futures.ProcessPoolExecutor()
    else:
        pool = concurrent.futures.ThreadPoolExecutor()
    tasks = {}
    with pool:
        for path in htmldir.glob('**/*.html'):
            relative_path = path.relative_to(htmldir)
            if '_static' in relative_path.parts:
                continue
            if 'whatsnew' in relative_path.parts:
                continue
            tasks[relative_path] = pool.submit(get_ids_from_file, path=path)

        ids_by_page = {}
        for relative_path, future in tasks.items():
            verbose_print(relative_path)
            ids = future.result()
            ids_by_page[str(relative_path)] = ids
            verbose_print(f' - {len(ids)} ids found')

    common = set.intersection(*ids_by_page.values())
    verbose_print(f'Filtering out {len(common)} common ids')
    for key, page_ids in ids_by_page.items():
        ids_by_page[key] = sorted(page_ids - common)

    return ids_by_page


def do_check(baseline, checked, excluded, *, verbose_print):
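    """Print baseline IDs that are missing from *checked*.

    Return True when nothing (apart from ignored and excluded IDs)
    is missing.
    """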
    successful = True
    for name, baseline_ids in sorted(baseline.items()):
        try:
            checked_ids = checked[name]
        except KeyError:
            successful = False
            print(f'{name}: (page missing)')
            print()
        else:
            missing_ids = set(baseline_ids) - set(checked_ids)
            if missing_ids:
                missing_ids = {
                    a
                    for a in missing_ids
                    if not IGNORED_ID_RE.fullmatch(a)
                    and (name, a) not in excluded
                }
                if missing_ids:
                    successful = False
                    for missing_id in sorted(missing_ids):
                        print(f'{name}: {missing_id}')
                    print()
    return successful


def main(argv):
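    """Command-line entry point."""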
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='print out more information',
    )
    subparsers = parser.add_subparsers(dest='command', required=True)

    collect = subparsers.add_parser(
        'collect', help='collect IDs from a set of HTML files'
    )
    collect.add_argument(
        'htmldir', type=Path, help='directory with HTML documentation'
    )
    collect.add_argument(
        '-o',
        '--outfile',
        help='file to save the result in; default <htmldir>/html-ids.json.gz',
    )

    check = subparsers.add_parser('check', help='check two archives of IDs')
    check.add_argument(
        'baseline_file', type=Path, help='file with baseline IDs'
    )
    check.add_argument('checked_file', type=Path, help='file with checked IDs')
    check.add_argument(
        '-x',
        '--exclude-file',
        type=Path,
        help='file with IDs to exclude from the check',
    )

    args = parser.parse_args(argv[1:])

    if args.verbose:
        verbose_print = functools.partial(print, file=sys.stderr)
    else:

        def verbose_print(*args, **kwargs):
            """do nothing"""

    if args.command == 'collect':
        ids = gather_ids(args.htmldir, verbose_print=verbose_print)
        if args.outfile is None:
            args.outfile = args.htmldir / 'html-ids.json.gz'
        with gzip.open(args.outfile, 'wt', encoding='utf-8') as zfile:
            json.dump({'ids_by_page': ids}, zfile)

    if args.command == 'check':
        with gzip.open(args.baseline_file) as zfile:
            baseline = json.load(zfile)['ids_by_page']
        with gzip.open(args.checked_file) as zfile:
            checked = json.load(zfile)['ids_by_page']
        excluded = set()
        if args.exclude_file:
            # Each line is a "<page>: <id>" pair; blank lines and
            # '#' comments are skipped.
            with open(args.exclude_file, encoding='utf-8') as file:
                for line in file:
                    line = line.strip()
                    if line and not line.startswith('#'):
                        name, sep, excluded_id = line.partition(':')
                        if sep:
                            excluded.add((name.strip(), excluded_id.strip()))
        if do_check(baseline, checked, excluded, verbose_print=verbose_print):
            verbose_print('All OK')
        else:
            sys.stdout.flush()
            print(
                'ERROR: Removed IDs found',
                'The above HTML IDs were removed from the documentation, '
                + 'resulting in broken links. Please add them back.',
                sep='\n',
                file=sys.stderr,
            )
            if args.exclude_file:
                print(
                    f'Alternatively, add them to {args.exclude_file}.',
                    file=sys.stderr,
                )
            # Signal failure to callers (e.g. CI) via the exit status.
            sys.exit(1)


if __name__ == '__main__':
    main(sys.argv)