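"""Check that HTML IDs (link anchors) do not disappear from Sphinx docs.

Subcommands:

* ``collect``: gather the IDs found on each page of a Sphinx HTML build
  and save them as a gzipped JSON archive
  (default: <htmldir>/html-ids.json.gz).
* ``check``: compare a baseline archive against a newer one and report
  any IDs that were removed.

The exclude file for ``check`` (``-x``) lists one ``<page>: <id>`` pair
per line; blank lines and ``#`` comments are ignored. For example
(illustrative values)::

    # Anchors that were removed on purpose:
    library/example.html: removed-anchor

Requires Python 3.14+ (uses the ``compression.gzip`` module).
"""
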
import argparse
import concurrent.futures
import functools
import html.parser
import json
import re
import sys
from compression import gzip
from pathlib import Path


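# Auto-generated IDs (e.g. docutils' "id1", "id2", ... and Sphinx's
# "index-0") are not stable across builds, so they are ignored both when
# collecting and when checking.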
IGNORED_ID_RE = re.compile(
    r"""
    index-\d+
    | id\d+
    | [_a-z]+_\d+
    """,
    re.VERBOSE,
)


class IDGatherer(html.parser.HTMLParser):
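    """HTML parser that collects ``id`` attribute values into *ids*."""
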
    def __init__(self, ids):
        super().__init__()
        self.__ids = ids

    def handle_starttag(self, tag, attrs):
        for name, value in attrs:
            if name == 'id':
                if not IGNORED_ID_RE.fullmatch(value):
                    self.__ids.add(value)


def get_ids_from_file(path):
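    """Parse the HTML file at *path* and return the set of IDs found."""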
    ids = set()
    gatherer = IDGatherer(ids)
    with path.open(encoding='utf-8') as file:
        while chunk := file.read(4096):
            gatherer.feed(chunk)
    gatherer.close()  # flush any data still buffered in the parser
    return ids


def gather_ids(htmldir, *, verbose_print):
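    """Map each page of a Sphinx HTML build to a sorted list of its IDs.

    IDs that appear on every page (typically theme boilerplate) are
    filtered out, as are pages under _static/ and whatsnew/.
    """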
    if not htmldir.joinpath('objects.inv').exists():
        raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory')

    # Threads only help on free-threaded builds; with the GIL enabled,
    # use separate processes to parse in parallel.
    if sys._is_gil_enabled():
        pool = concurrent.futures.ProcessPoolExecutor()
    else:
        pool = concurrent.futures.ThreadPoolExecutor()
    tasks = {}
    with pool:
        for path in htmldir.glob('**/*.html'):
            relative_path = path.relative_to(htmldir)
            if '_static' in relative_path.parts:
                continue
            if 'whatsnew' in relative_path.parts:
                continue
            tasks[relative_path] = pool.submit(get_ids_from_file, path=path)

        ids_by_page = {}
        for relative_path, future in tasks.items():
            verbose_print(relative_path)
            ids = future.result()
            ids_by_page[str(relative_path)] = ids
            verbose_print(f' - {len(ids)} ids found')

    common = set.intersection(*ids_by_page.values())
    verbose_print(f'Filtering out {len(common)} common ids')
    for key, page_ids in ids_by_page.items():
        ids_by_page[key] = sorted(page_ids - common)

    return ids_by_page


def do_check(baseline, checked, excluded, *, verbose_print):
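    """Print baseline IDs that are missing from *checked*.

    Return True when nothing (apart from ignored and excluded IDs)
    is missing.
    """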
    successful = True
    for name, baseline_ids in sorted(baseline.items()):
        try:
            checked_ids = checked[name]
        except KeyError:
            successful = False
            print(f'{name}: (page missing)')
            print()
        else:
            missing_ids = set(baseline_ids) - set(checked_ids)
            if missing_ids:
                missing_ids = {
                    a
                    for a in missing_ids
                    if not IGNORED_ID_RE.fullmatch(a)
                    and (name, a) not in excluded
                }
                if missing_ids:
                    successful = False
                    for missing_id in sorted(missing_ids):
                        print(f'{name}: {missing_id}')
                    print()
    return successful


def main(argv):
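    """Command-line entry point."""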
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='print out more information',
    )
    subparsers = parser.add_subparsers(dest='command', required=True)

    collect = subparsers.add_parser(
        'collect', help='collect IDs from a set of HTML files'
    )
    collect.add_argument(
        'htmldir', type=Path, help='directory with HTML documentation'
    )
    collect.add_argument(
        '-o',
        '--outfile',
        help='file to save the result in; default <htmldir>/html-ids.json.gz',
    )

    check = subparsers.add_parser('check', help='check two archives of IDs')
    check.add_argument(
        'baseline_file', type=Path, help='file with baseline IDs'
    )
    check.add_argument('checked_file', type=Path, help='file with checked IDs')
    check.add_argument(
        '-x',
        '--exclude-file',
        type=Path,
        help='file with IDs to exclude from the check',
    )

    args = parser.parse_args(argv[1:])

    if args.verbose:
        verbose_print = functools.partial(print, file=sys.stderr)
    else:

        def verbose_print(*args, **kwargs):
            """do nothing"""

    if args.command == 'collect':
        ids = gather_ids(args.htmldir, verbose_print=verbose_print)
        if args.outfile is None:
            args.outfile = args.htmldir / 'html-ids.json.gz'
        with gzip.open(args.outfile, 'wt', encoding='utf-8') as zfile:
            json.dump({'ids_by_page': ids}, zfile)

    if args.command == 'check':
        with gzip.open(args.baseline_file) as zfile:
            baseline = json.load(zfile)['ids_by_page']
        with gzip.open(args.checked_file) as zfile:
            checked = json.load(zfile)['ids_by_page']
        excluded = set()
        if args.exclude_file:
            # Each line is a "<page>: <id>" pair; blank lines and
            # '#' comments are skipped.
            with open(args.exclude_file, encoding='utf-8') as file:
                for line in file:
                    line = line.strip()
                    if line and not line.startswith('#'):
                        name, sep, excluded_id = line.partition(':')
                        if sep:
                            excluded.add((name.strip(), excluded_id.strip()))
        if do_check(baseline, checked, excluded, verbose_print=verbose_print):
            verbose_print('All OK')
        else:
            sys.stdout.flush()
            print(
                'ERROR: Removed IDs found',
                'The above HTML IDs were removed from the documentation, '
                + 'resulting in broken links. Please add them back.',
                sep='\n',
                file=sys.stderr,
            )
            if args.exclude_file:
                print(
                    f'Alternatively, add them to {args.exclude_file}.',
                    file=sys.stderr,
                )
            # Signal failure to callers (e.g. CI) via the exit status.
            sys.exit(1)


if __name__ == '__main__':
    main(sys.argv)