mirror of
https://github.com/openzim/warc2zim.git
synced 2025-10-19 14:33:17 +00:00
174 lines
5 KiB
Python
174 lines
5 KiB
Python
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
from jinja2 import Environment
|
|
|
|
# The rule definitions live in a `rules.yaml` file next to this script.
rules_src = Path(__file__).with_name("rules.yaml")
if not rules_src.exists():
    # This skip is useful mostly for CI operations when installing only Python deps
    print("Skipping rules generation, rule file is missing")
    sys.exit()

# Decode explicitly as UTF-8 so that parsing does not depend on the locale encoding
# of the machine running the generator.
FUZZY_RULES = yaml.safe_load(rules_src.read_text(encoding="utf-8"))["fuzzyRules"]

# Every rule needs a name (used to build identifiers in the generated tests) and at
# least one test case; abort the generation otherwise.
for rule in FUZZY_RULES:
    if "name" not in rule:
        raise SystemExit("Fuzzy rule is missing a name")
    # A truthiness check also rejects a `tests:` key left empty in the YAML (parsed
    # as None), which would otherwise fail with a TypeError on len().
    if not rule.get("tests"):
        raise SystemExit("Fuzzy rule is missing test cases")
|
|
|
|
|
|
# Matches a backslash followed by one ASCII digit (e.g. `\1`) and captures the digit,
# so that such sequences in a rule's replacement string can be rewritten as `$1`.
PY2JS_RULE_RX = re.compile(r"\\(\d)", flags=re.ASCII)

# Autoescaping stays off on purpose: the templates produce source code which must be
# emitted verbatim, and the output is never interpreted as HTML.
JINJA_ENV = Environment(autoescape=False)  # noqa: S701
|
|
|
|
### Generate Javascript code

# Template of the generated JS module: an exported array holding one object per rule,
# with the `match` and `replace` values emitted as single-quoted JS string literals.
js_code_template = """// THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY

export const fuzzyRules = [
{% for rule in FUZZY_RULES %}  {
    match: '{{ rule['match'] }}',
    replace: '{{ rule['replace'] }}',
  },
{% endfor %}
];

"""

js_parent = Path(__file__).joinpath("../../javascript/src").resolve()
if not js_parent.exists():
    # This skip is useful mostly for CI operations when working on the Python part
    print("Skipping JS rules generation, target folder is missing")
else:
    # Write explicitly as UTF-8 so the generated file is identical on every platform.
    (js_parent / "fuzzyRules.js").write_text(
        JINJA_ENV.from_string(js_code_template).render(
            FUZZY_RULES=[
                {
                    # Backslashes are doubled because the pattern ends up inside a
                    # JS string literal, where a single backslash starts an escape.
                    "match": rule["pattern"].replace("\\", "\\\\"),
                    # Rewrite `\<digit>` sequences of the replacement as `$<digit>`.
                    "replace": PY2JS_RULE_RX.sub(r"$\1", rule["replace"]),
                }
                for rule in FUZZY_RULES
            ]
        ),
        encoding="utf-8",
    )
    print("JS rules generation completed successfully")
|
|
|
|
### Generate Javascript tests

# Template of the generated JS test file: one `ava` test per test case of each rule.
# A test case flagged `unchanged` expects `applyFuzzyRules` to return the raw URL
# as-is; any other case expects its `fuzzified_url`.
js_test_template = """// THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY

import test from 'ava';

import { applyFuzzyRules } from '../src/wombatSetup.js';

{% for rule in FUZZY_RULES %}
{% for test in rule['tests'] %}
test('fuzzyrules_{{rule['name']}}_{{loop.index}}', (t) => {
  t.is(
    applyFuzzyRules(
      '{{test['raw_url']}}',
    ),
    '{{test['raw_url'] if test['unchanged'] else test['fuzzified_url']}}',
  );
});
{% endfor %}
{% endfor %}
"""

js_parent = Path(__file__).joinpath("../../javascript/test").resolve()
if not js_parent.exists():
    # This skip is useful mostly for CI operations when working on the Python part
    print("Skipping JS tests generation, target folder is missing")
else:
    # Write explicitly as UTF-8 so the generated file is identical on every platform.
    (js_parent / "fuzzyRules.js").write_text(
        JINJA_ENV.from_string(js_test_template).render(
            # The test template only reads the name and the test cases of each rule,
            # so nothing else is passed to it.
            FUZZY_RULES=[
                {
                    "name": rule["name"],
                    "tests": rule["tests"],
                }
                for rule in FUZZY_RULES
            ]
        ),
        encoding="utf-8",
    )
    print("JS tests generation completed successfully")
|
|
|
|
### Generate Python code

# Template of the generated Python module: a `FUZZY_RULES` list holding one dict per
# rule, with the pattern and the replacement emitted as raw string literals.
py_code_template = """# THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY

FUZZY_RULES = [
{% for rule in FUZZY_RULES %}    {
        "pattern": r"{{ rule['pattern'] }}",
        "replace": r"{{ rule['replace'] }}",
    },
{% endfor %}
]
"""

py_parent = Path(__file__).joinpath("../../src/warc2zim").resolve()
if not py_parent.exists():
    # This skip is useful mostly for CI operations when working on the JS part
    print("Skipping Python rules generation, target folder is missing")
else:
    # `py_parent` is already resolved, hence absolute. Write explicitly as UTF-8 so
    # the generated file is identical on every platform.
    (py_parent / "rules.py").write_text(
        JINJA_ENV.from_string(py_code_template).render(FUZZY_RULES=FUZZY_RULES),
        encoding="utf-8",
    )
    print("Python rules generation completed successfully")
|
|
|
|
### Generate Python tests

# Template of the generated pytest module. For each rule it emits a parametrized
# fixture named `<rule name>_case` with one `ContentForTests` per test case, and a
# test function comparing `apply_fuzzy_rules` output with the expected string.
# Test cases flagged `unchanged` are built from the raw URL only; the others also
# pass their `fuzzified_url`.
# The indentation inside the template is significant: it becomes the indentation of
# the generated Python code.
py_test_template = """# THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY

import pytest

from warc2zim.url_rewriting import apply_fuzzy_rules

from .utils import ContentForTests

{% for rule in FUZZY_RULES %}
@pytest.fixture(
    params=[
{% for test in rule['tests'] %}
{% if test['unchanged'] %}
        ContentForTests(
            "{{ test['raw_url'] }}",
        ),
{% else %}
        ContentForTests(
            "{{ test['raw_url'] }}",
            "{{ test['fuzzified_url'] }}",
        ),
{% endif %}
{% endfor %}
    ]
)
def {{ rule['name'] }}_case(request):
    yield request.param


def test_fuzzyrules_{{ rule['name'] }}({{ rule['name'] }}_case):
    assert (
        apply_fuzzy_rules({{ rule['name'] }}_case.input_str)
        == {{ rule['name'] }}_case.expected_str
    )
{% endfor %}

"""

py_parent = Path(__file__).joinpath("../../tests").resolve()
if not py_parent.exists():
    # This skip is useful mostly for CI operations when working on the JS part
    print("Skipping Python tests generation, target folder is missing")
else:
    # `py_parent` is already resolved, hence absolute. Write explicitly as UTF-8 so
    # the generated file is identical on every platform.
    (py_parent / "test_fuzzy_rules.py").write_text(
        JINJA_ENV.from_string(py_test_template).render(FUZZY_RULES=FUZZY_RULES),
        encoding="utf-8",
    )
    print("Python tests generation completed successfully")
|