mirror of
https://github.com/python/cpython.git
synced 2026-06-17 15:16:42 +00:00
[3.14] GH-145000: Add a tool to record/check removed HTML IDs (GH-145001) (GH-145212)
(cherry picked from commit 9b22261a86)
Co-authored-by: Petr Viktorin <encukou@gmail.com>
This commit is contained in:
parent
a7beca8ae3
commit
ff365ebe98
3 changed files with 190 additions and 0 deletions
|
|
@ -32,6 +32,9 @@ ignore = [
|
|||
"E501", # Ignore line length errors (we use auto-formatting)
|
||||
]
|
||||
|
||||
[lint.per-file-ignores]
|
||||
"tools/check-html-ids.py" = ["I001"] # Unsorted imports
|
||||
|
||||
[format]
|
||||
preview = true
|
||||
quote-style = "preserve"
|
||||
|
|
|
|||
|
|
@ -336,3 +336,9 @@ autobuild-stable-html:
|
|||
exit 1;; \
|
||||
esac
|
||||
@$(MAKE) autobuild-dev-html
|
||||
|
||||
# Collect HTML IDs to a JSON document
|
||||
.PHONY: html-ids
|
||||
html-ids:
|
||||
$(PYTHON) tools/check-html-ids.py collect build/html \
|
||||
-o build/html/html-ids.json.gz
|
||||
|
|
|
|||
181
Doc/tools/check-html-ids.py
Normal file
181
Doc/tools/check-html-ids.py
Normal file
|
|
@ -0,0 +1,181 @@
|
|||
from compression import gzip
|
||||
import concurrent.futures
|
||||
from pathlib import Path
|
||||
import html.parser
|
||||
import functools
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import re
|
||||
|
||||
|
||||
# IDs that match this pattern in full are never collected (see IDGatherer)
# and never reported as removed (see do_check).
# NOTE(review): these look like auto-numbered IDs ("index-3", "id12",
# "some_name_2") that are not stable between builds -- confirm.
IGNORED_ID_RE = re.compile(
    r"""
    index-\d+
    | id\d+
    | [_a-z]+_\d+
    """,
    re.VERBOSE,
)


class IDGatherer(html.parser.HTMLParser):
    """HTML parser that records the ``id`` attributes of the tags it sees.

    Every non-empty ``id`` value that does not fully match
    ``IGNORED_ID_RE`` is added to the set passed to the constructor.
    The set is mutated in place; the parser itself returns nothing.
    """

    def __init__(self, ids):
        """Create a parser that adds the IDs it finds to the set *ids*."""
        super().__init__()
        self.__ids = ids

    def handle_starttag(self, tag, attrs):
        """Record the ``id`` attribute(s) of one start tag."""
        for name, value in attrs:
            if name != 'id':
                continue
            # HTMLParser reports a valueless attribute (``<a id>``) as None,
            # which the regex cannot match against; an empty ID is not a
            # usable link target either, so skip both.
            if not value:
                continue
            if not IGNORED_ID_RE.fullmatch(value):
                self.__ids.add(value)
|
||||
|
||||
|
||||
def get_ids_from_file(path):
    """Return the set of HTML IDs gathered from the file at *path*.

    The file is opened through ``path.open`` as UTF-8 text and fed to an
    ``IDGatherer`` in 4096-character pieces, so the whole document is
    never held in memory at once.
    """
    found = set()
    parser = IDGatherer(found)
    with path.open(encoding='utf-8') as stream:
        # read() returns '' at end of file, which ends the iteration.
        for piece in iter(lambda: stream.read(4096), ''):
            parser.feed(piece)
    return found
|
||||
|
||||
|
||||
def gather_ids(htmldir, *, verbose_print):
    """Collect the page-specific HTML IDs below the directory *htmldir*.

    Every ``*.html`` file under *htmldir* is scanned with
    ``get_ids_from_file``, except files that have ``_static`` or
    ``whatsnew`` as a path component.  IDs that occur on every scanned
    page are dropped from the result.

    *verbose_print* is called with progress messages.

    Return a dict mapping each page's path (relative to *htmldir*, as a
    string) to a sorted list of its IDs.  The dict is empty when no page
    was scanned.

    Raise ValueError if *htmldir* does not contain an ``objects.inv`` file.
    """
    if not htmldir.joinpath('objects.inv').exists():
        raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory')

    skipped_parts = {'_static', 'whatsnew'}
    pages = {}
    for path in htmldir.glob('**/*.html'):
        relative_path = path.relative_to(htmldir)
        if skipped_parts.isdisjoint(relative_path.parts):
            pages[relative_path] = path
    if not pages:
        # Nothing to scan; set.intersection() below needs at least one set.
        return {}

    # sys._is_gil_enabled is a function and must be *called*: the bare
    # function object is always true, which made the thread-pool branch
    # unreachable.  Fall back to "GIL enabled" where it does not exist.
    gil_enabled = getattr(sys, '_is_gil_enabled', lambda: True)()
    if gil_enabled:
        pool = concurrent.futures.ProcessPoolExecutor()
    else:
        pool = concurrent.futures.ThreadPoolExecutor()

    ids_by_page = {}
    # The with-block shuts the executor down, even if a task raises.
    with pool:
        tasks = {
            relative_path: pool.submit(get_ids_from_file, path=path)
            for relative_path, path in pages.items()
        }
        for relative_path, future in tasks.items():
            verbose_print(relative_path)
            ids = future.result()
            ids_by_page[str(relative_path)] = ids
            verbose_print(f' - {len(ids)} ids found')

    common = set.intersection(*ids_by_page.values())
    verbose_print(f'Filtering out {len(common)} common ids')
    return {
        page: sorted(page_ids - common)
        for page, page_ids in ids_by_page.items()
    }
|
||||
|
||||
|
||||
def do_check(baseline, checked, excluded, *, verbose_print):
    """Report IDs that are in *baseline* but no longer in *checked*.

    *baseline* and *checked* map page names to collections of IDs.
    Pages are processed in sorted order.  For a page that is absent from
    *checked*, a "(page missing)" line is printed.  Otherwise one
    ``page: id`` line is printed per removed ID, skipping IDs that fully
    match ``IGNORED_ID_RE`` and ``(page, id)`` pairs found in *excluded*.
    Each reported page is followed by an empty line.

    *verbose_print* is accepted but not used here.

    Return True if nothing was reported, False otherwise.
    """
    all_present = True
    for page in sorted(baseline):
        if page not in checked:
            all_present = False
            print(f'{page}: (page missing)')
            print()
            continue

        candidates = set(baseline[page]).difference(checked[page])
        removed = sorted(
            html_id
            for html_id in candidates
            if not IGNORED_ID_RE.fullmatch(html_id)
            and (page, html_id) not in excluded
        )
        if not removed:
            continue

        all_present = False
        for html_id in removed:
            print(f'{page}: {html_id}')
        print()
    return all_present
|
||||
|
||||
|
||||
def main(argv):
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'-v',
|
||||
'--verbose',
|
||||
action='store_true',
|
||||
help='print out more information',
|
||||
)
|
||||
subparsers = parser.add_subparsers(dest='command', required=True)
|
||||
|
||||
collect = subparsers.add_parser(
|
||||
'collect', help='collect IDs from a set of HTML files'
|
||||
)
|
||||
collect.add_argument(
|
||||
'htmldir', type=Path, help='directory with HTML documentation'
|
||||
)
|
||||
collect.add_argument(
|
||||
'-o',
|
||||
'--outfile',
|
||||
help='File to save the result in; default <htmldir>/html-ids.json.gz',
|
||||
)
|
||||
|
||||
check = subparsers.add_parser('check', help='check two archives of IDs')
|
||||
check.add_argument(
|
||||
'baseline_file', type=Path, help='file with baseline IDs'
|
||||
)
|
||||
check.add_argument('checked_file', type=Path, help='file with checked IDs')
|
||||
check.add_argument(
|
||||
'-x',
|
||||
'--exclude-file',
|
||||
type=Path,
|
||||
help='file with IDs to exclude from the check',
|
||||
)
|
||||
|
||||
args = parser.parse_args(argv[1:])
|
||||
|
||||
if args.verbose:
|
||||
verbose_print = functools.partial(print, file=sys.stderr)
|
||||
else:
|
||||
|
||||
def verbose_print(*args, **kwargs):
|
||||
"""do nothing"""
|
||||
|
||||
if args.command == 'collect':
|
||||
ids = gather_ids(args.htmldir, verbose_print=verbose_print)
|
||||
if args.outfile is None:
|
||||
args.outfile = args.htmldir / 'html-ids.json.gz'
|
||||
with gzip.open(args.outfile, 'wt', encoding='utf-8') as zfile:
|
||||
json.dump({'ids_by_page': ids}, zfile)
|
||||
|
||||
if args.command == 'check':
|
||||
with gzip.open(args.baseline_file) as zfile:
|
||||
baseline = json.load(zfile)['ids_by_page']
|
||||
with gzip.open(args.checked_file) as zfile:
|
||||
checked = json.load(zfile)['ids_by_page']
|
||||
excluded = set()
|
||||
if args.exclude_file:
|
||||
with open(args.exclude_file, encoding='utf-8') as file:
|
||||
for line in file:
|
||||
line = line.strip()
|
||||
if line and not line.startswith('#'):
|
||||
name, sep, excluded_id = line.partition(':')
|
||||
if sep:
|
||||
excluded.add((name.strip(), excluded_id.strip()))
|
||||
if do_check(baseline, checked, excluded, verbose_print=verbose_print):
|
||||
verbose_print('All OK')
|
||||
else:
|
||||
sys.stdout.flush()
|
||||
print(
|
||||
'ERROR: Removed IDs found',
|
||||
'The above HTML IDs were removed from the documentation, '
|
||||
+ 'resulting in broken links. Please add them back.',
|
||||
sep='\n',
|
||||
file=sys.stderr,
|
||||
)
|
||||
if args.exclude_file:
|
||||
print(f'Alternatively, add them to {args.exclude_file}.')
|
||||
|
||||
|
||||
# Script entry point: pass the complete command line to main().
if __name__ == '__main__':
    main(argv=sys.argv)
|
||||
Loading…
Add table
Add a link
Reference in a new issue