Move rules to YAML instead of JSON to add support for inline comments

This commit is contained in:
benoit74 2024-06-25 11:55:39 +00:00
parent 3ba38f2eef
commit a38fdc7b9f
No known key found for this signature in database
GPG key ID: B89606434FC7B530
9 changed files with 60 additions and 53 deletions

View file

@ -19,8 +19,13 @@ jobs:
python-version-file: pyproject.toml
architecture: x64
- name: Install dependencies (and project)
run: |
pip install -U pip
pip install -e .[scripts]
- name: Generate fuzzy rules
run: pip install jinja2==3.1.3 && python rules/generate_rules.py
run: python rules/generate_rules.py
- name: Build Javascript wombatSetup.js
uses: addnab/docker-run-action@v3

View file

@ -62,7 +62,7 @@ jobs:
- name: Install dependencies (and project)
run: |
pip install -U pip build
pip install -e .
pip install -e .[scripts]
- name: Generate fuzzy rules
run: python rules/generate_rules.py

View file

@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Changed
- Moved rules definition from JSON to YAML and documented update process (#216)
## [2.0.2] - 2024-06-18
### Added

View file

@ -14,7 +14,8 @@ python3 -m venv /local
/local/bin/python -m pip install --no-cache-dir -U \
pip \
jinja2==3.1.3
jinja2==3.1.4 \
PyYAML==6.0.1
/local/bin/python /src/rules/generate_rules.py

View file

@ -2,7 +2,7 @@
## Fuzzy rules
Fuzzy rules are stored in `rules/rules.json`. This configuration file is then used by `rules/generateRules.py` to generate Python and JS code.
Fuzzy rules are stored in `rules/rules.yaml`. This configuration file is then used by `rules/generate_rules.py` to generate Python and JS code.
Should you update these fuzzy rules, you hence have to:
- regenerate Python and JS files by running `python rules/generate_rules.py`

View file

@ -1,6 +1,7 @@
[build-system]
# jinja2 is required to generate JS and Python rules at build time
requires = ["hatchling", "hatch-openzim==0.2.1", "jinja2==3.1.4"]
# PyYAML is used to parse fuzzy rules and generate Python/JS code
requires = ["hatchling", "hatch-openzim==0.2.1", "jinja2==3.1.4", "PyYAML==6.0.1"]
build-backend = "hatchling.build"
[project]
@ -12,7 +13,7 @@ dependencies = [
"warcio==1.7.4",
"requests==2.32.3",
"zimscraperlib==3.3.2",
"jinja2==3.1.4",
"jinja2==3.1.4", # also update version in build-system above and in build_js.sh
# to support possible brotli content in warcs, must be added separately
"brotlipy==0.7.0",
"cdxj_indexer==1.4.5",
@ -35,6 +36,7 @@ email="info@webrecorder.net"
[project.optional-dependencies]
scripts = [
"invoke==2.2.0",
"PyYAML==6.0.1", # used to parse fuzzy rules and generate Python/JS code ; also update version in build-system above and in build_js.sh
]
lint = [
"black==24.4.2",

View file

@ -1,17 +1,17 @@
import json
import re
import sys
from pathlib import Path
import yaml
from jinja2 import Environment
rules_src = Path(__file__).with_name("rules.json")
rules_src = Path(__file__).with_name("rules.yaml")
if not rules_src.exists():
# This skip is mostly useful for CI operations when installing only Python deps
print("Skipping rules generation, rule file is missing")
sys.exit()
FUZZY_RULES = json.loads(rules_src.read_text())["fuzzyRules"]
FUZZY_RULES = yaml.safe_load(rules_src.read_text())["fuzzyRules"]
PY2JS_RULE_RX = re.compile(r"\\(\d)", re.ASCII)

View file

@ -1,44 +0,0 @@
{
"fuzzyRules": [
{
"pattern": ".*googlevideo.com/(videoplayback(?=\\?)).*[?&](id=[^&]+).*",
"replace": "youtube.fuzzy.replayweb.page/\\1?\\2"
},
{
"pattern": "(?:www\\.)?youtube(?:-nocookie)?\\.com/(get_video_info\\?).*(video_id=[^&]+).*",
"replace": "youtube.fuzzy.replayweb.page/\\1\\2"
},
{
"pattern": "i\\.ytimg\\.com\\/vi\\/(.*?)\\/.*?\\.(\\w*?)(?:\\?.*|$)",
"replace": "i.ytimg.com.fuzzy.replayweb.page/vi/\\1/thumbnail.\\2"
},
{
"pattern": "([^?]+)\\?[\\d]+$",
"replace": "\\1"
},
{
"pattern": "(?:www\\.)?youtube(?:-nocookie)?\\.com\\/(youtubei\\/[^?]+).*(videoId[^&]+).*",
"replace": "youtube.fuzzy.replayweb.page/\\1?\\2"
},
{
"pattern": "(?:www\\.)?youtube(?:-nocookie)?\\.com/embed/([^?]+).*",
"replace": "youtube.fuzzy.replayweb.page/embed/\\1"
},
{
"pattern": ".*(?:gcs-vimeo|vod|vod-progressive|vod-adaptive)\\.akamaized\\.net.*/(.+?.mp4)\\?.*range=(.*?)(?:&|$)",
"replace": "vimeo-cdn.fuzzy.replayweb.page/\\1?range=\\2"
},
{
"pattern": ".*(?:gcs-vimeo|vod|vod-progressive)\\.akamaized\\.net.*?/([\\d/]+.mp4)$",
"replace": "vimeo-cdn.fuzzy.replayweb.page/\\1"
},
{
"pattern": ".*player.vimeo.com/(video/[\\d]+)\\?.*",
"replace": "vimeo.fuzzy.replayweb.page/\\1"
},
{
"pattern": ".*i\\.vimeocdn\\.com\\/(.*)\\?.*",
"replace": "i.vimeocdn.fuzzy.replayweb.page/\\1"
}
]
}

39
rules/rules.yaml Normal file
View file

@ -0,0 +1,39 @@
# This file comes from an adaptation of rules present in
# https://github.com/webrecorder/wabac.js/blame/main/src/fuzzymatcher.js
#
# Syncing rules is done manually, based on expert knowledge, especially because in
# warc2zim we are not really fuzzy matching (searching the best entry among existing
# ones) but just rewriting to proper path.
#
# This file is in sync with content at commit 879018d5b96962df82340a9a57570bbc0fc67815
# from June 9, 2024
#
# This file should be updated at every release of warc2zim
#
# Some rules are intentionally missing because they have not been tested in warc2zim yet: Twitter,
# Washington Post, WixStatic, Facebook
#
# Generic rules are also omitted on purpose; we don't need them
#
fuzzyRules:
- pattern: .*googlevideo.com/(videoplayback(?=\?)).*[?&](id=[^&]+).*
replace: youtube.fuzzy.replayweb.page/\1?\2
- pattern: (?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?).*(video_id=[^&]+).*
replace : youtube.fuzzy.replayweb.page/\1\2
- pattern: i\.ytimg\.com\/vi\/(.*?)\/.*?\.(\w*?)(?:\?.*|$)
replace : i.ytimg.com.fuzzy.replayweb.page/vi/\1/thumbnail.\2
- pattern: ([^?]+)\?[\d]+$
replace : \1
- pattern: (?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*
replace : youtube.fuzzy.replayweb.page/\1?\2
- pattern: (?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*
replace : youtube.fuzzy.replayweb.page/embed/\1
# next one is a custom warc2zim rule intended to fix Vimeo support
- pattern: .*(?:gcs-vimeo|vod|vod-progressive|vod-adaptive)\.akamaized\.net.*/(.+?.mp4)\?.*range=(.*?)(?:&|$)
replace : vimeo-cdn.fuzzy.replayweb.page/\1?range=\2
- pattern: .*(?:gcs-vimeo|vod|vod-progressive)\.akamaized\.net.*?/([\d/]+.mp4)$
replace : vimeo-cdn.fuzzy.replayweb.page/\1
- pattern: .*player.vimeo.com/(video/[\d]+)\?.*
replace : vimeo.fuzzy.replayweb.page/\1
- pattern: .*i\.vimeocdn\.com\/(.*)\?.*
replace : i.vimeocdn.fuzzy.replayweb.page/\1