mirror of
https://github.com/openzim/warc2zim.git
synced 2025-10-19 14:33:17 +00:00
Move rules to YAML instead of JSON to add support for inline comments
This commit is contained in:
parent
3ba38f2eef
commit
a38fdc7b9f
9 changed files with 60 additions and 53 deletions
7
.github/workflows/Publish.yaml
vendored
7
.github/workflows/Publish.yaml
vendored
|
@ -19,8 +19,13 @@ jobs:
|
|||
python-version-file: pyproject.toml
|
||||
architecture: x64
|
||||
|
||||
- name: Install dependencies (and project)
|
||||
run: |
|
||||
pip install -U pip
|
||||
pip install -e .[scripts]
|
||||
|
||||
- name: Generate fuzzy rules
|
||||
run: pip install jinja2==3.1.3 && python rules/generate_rules.py
|
||||
run: python rules/generate_rules.py
|
||||
|
||||
- name: Build Javascript wombatSetup.js
|
||||
uses: addnab/docker-run-action@v3
|
||||
|
|
2
.github/workflows/Tests.yaml
vendored
2
.github/workflows/Tests.yaml
vendored
|
@ -62,7 +62,7 @@ jobs:
|
|||
- name: Install dependencies (and project)
|
||||
run: |
|
||||
pip install -U pip build
|
||||
pip install -e .
|
||||
pip install -e .[scripts]
|
||||
|
||||
- name: Generate fuzzy rules
|
||||
run: python rules/generate_rules.py
|
||||
|
|
|
@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
### Changed
|
||||
|
||||
- Moved rules definition from JSON to YAML and documented update process (#216)
|
||||
|
||||
## [2.0.2] - 2024-06-18
|
||||
|
||||
### Added
|
||||
|
|
|
@ -14,7 +14,8 @@ python3 -m venv /local
|
|||
|
||||
/local/bin/python -m pip install --no-cache-dir -U \
|
||||
pip \
|
||||
jinja2==3.1.3
|
||||
jinja2==3.1.4 \
|
||||
PyYAML==6.0.1
|
||||
|
||||
/local/bin/python /src/rules/generate_rules.py
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
## Fuzzy rules
|
||||
|
||||
Fuzzy rules are stored in `rules/rules.json`. This configuration file is then used by `rules/generateRules.py` to generate Python and JS code.
|
||||
Fuzzy rules are stored in `rules/rules.yaml`. This configuration file is then used by `rules/generate_rules.py` to generate Python and JS code.
|
||||
|
||||
Should you update these fuzzy rules, you hence have to:
|
||||
- regenerate Python and JS files by running `python rules/generate_rules.py`
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
[build-system]
|
||||
# jinja2 is required to generate JS and Python rules at build time
|
||||
requires = ["hatchling", "hatch-openzim==0.2.1", "jinja2==3.1.4"]
|
||||
# PyYAML is used to parse fuzzy rules and generate Python/JS code
|
||||
requires = ["hatchling", "hatch-openzim==0.2.1", "jinja2==3.1.4", "PyYAML==6.0.1"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
|
@ -12,7 +13,7 @@ dependencies = [
|
|||
"warcio==1.7.4",
|
||||
"requests==2.32.3",
|
||||
"zimscraperlib==3.3.2",
|
||||
"jinja2==3.1.4",
|
||||
"jinja2==3.1.4", # also update version in build-system above and in build_js.sh
|
||||
# to support possible brotli content in warcs, must be added separately
|
||||
"brotlipy==0.7.0",
|
||||
"cdxj_indexer==1.4.5",
|
||||
|
@ -35,6 +36,7 @@ email="info@webrecorder.net"
|
|||
[project.optional-dependencies]
|
||||
scripts = [
|
||||
"invoke==2.2.0",
|
||||
"PyYAML==6.0.1", # used to parse fuzzy rules and generate Python/JS code ; also update version in build-system above and in build_js.sh
|
||||
]
|
||||
lint = [
|
||||
"black==24.4.2",
|
||||
|
|
|
@ -1,17 +1,17 @@
|
|||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
from jinja2 import Environment
|
||||
|
||||
rules_src = Path(__file__).with_name("rules.json")
|
||||
rules_src = Path(__file__).with_name("rules.yaml")
|
||||
if not rules_src.exists():
|
||||
# This skip is useful mostly for CI operations when installing only Python deps
|
||||
print("Skipping rules generation, rule file is missing")
|
||||
sys.exit()
|
||||
|
||||
FUZZY_RULES = json.loads(rules_src.read_text())["fuzzyRules"]
|
||||
FUZZY_RULES = yaml.safe_load(rules_src.read_text())["fuzzyRules"]
|
||||
|
||||
PY2JS_RULE_RX = re.compile(r"\\(\d)", re.ASCII)
|
||||
|
||||
|
|
|
@ -1,44 +0,0 @@
|
|||
{
|
||||
"fuzzyRules": [
|
||||
{
|
||||
"pattern": ".*googlevideo.com/(videoplayback(?=\\?)).*[?&](id=[^&]+).*",
|
||||
"replace": "youtube.fuzzy.replayweb.page/\\1?\\2"
|
||||
},
|
||||
{
|
||||
"pattern": "(?:www\\.)?youtube(?:-nocookie)?\\.com/(get_video_info\\?).*(video_id=[^&]+).*",
|
||||
"replace": "youtube.fuzzy.replayweb.page/\\1\\2"
|
||||
},
|
||||
{
|
||||
"pattern": "i\\.ytimg\\.com\\/vi\\/(.*?)\\/.*?\\.(\\w*?)(?:\\?.*|$)",
|
||||
"replace": "i.ytimg.com.fuzzy.replayweb.page/vi/\\1/thumbnail.\\2"
|
||||
},
|
||||
{
|
||||
"pattern": "([^?]+)\\?[\\d]+$",
|
||||
"replace": "\\1"
|
||||
},
|
||||
{
|
||||
"pattern": "(?:www\\.)?youtube(?:-nocookie)?\\.com\\/(youtubei\\/[^?]+).*(videoId[^&]+).*",
|
||||
"replace": "youtube.fuzzy.replayweb.page/\\1?\\2"
|
||||
},
|
||||
{
|
||||
"pattern": "(?:www\\.)?youtube(?:-nocookie)?\\.com/embed/([^?]+).*",
|
||||
"replace": "youtube.fuzzy.replayweb.page/embed/\\1"
|
||||
},
|
||||
{
|
||||
"pattern": ".*(?:gcs-vimeo|vod|vod-progressive|vod-adaptive)\\.akamaized\\.net.*/(.+?.mp4)\\?.*range=(.*?)(?:&|$)",
|
||||
"replace": "vimeo-cdn.fuzzy.replayweb.page/\\1?range=\\2"
|
||||
},
|
||||
{
|
||||
"pattern": ".*(?:gcs-vimeo|vod|vod-progressive)\\.akamaized\\.net.*?/([\\d/]+.mp4)$",
|
||||
"replace": "vimeo-cdn.fuzzy.replayweb.page/\\1"
|
||||
},
|
||||
{
|
||||
"pattern": ".*player.vimeo.com/(video/[\\d]+)\\?.*",
|
||||
"replace": "vimeo.fuzzy.replayweb.page/\\1"
|
||||
},
|
||||
{
|
||||
"pattern": ".*i\\.vimeocdn\\.com\\/(.*)\\?.*",
|
||||
"replace": "i.vimeocdn.fuzzy.replayweb.page/\\1"
|
||||
}
|
||||
]
|
||||
}
|
39
rules/rules.yaml
Normal file
39
rules/rules.yaml
Normal file
|
@ -0,0 +1,39 @@
|
|||
# This file comes from an adaptation of rules present in
|
||||
# https://github.com/webrecorder/wabac.js/blame/main/src/fuzzymatcher.js
|
||||
#
|
||||
# Syncing rules is done manually, based on expert knowledge, especially because in
|
||||
# warc2zim we are not really fuzzy matching (searching the best entry among existing
|
||||
# ones) but just rewriting to proper path.
|
||||
#
|
||||
# This file is in sync with content at commit 879018d5b96962df82340a9a57570bbc0fc67815
|
||||
# from June 9, 2024
|
||||
#
|
||||
# This file should be updated at every release of warc2zim
|
||||
#
|
||||
# Some rules are intentionally missing because they have not been tested in warc2zim yet: Twitter,
|
||||
# Washington Post, WixStatic, Facebook
|
||||
#
|
||||
# Generic rules are also omitted on purpose, we don't need them
|
||||
#
|
||||
fuzzyRules:
|
||||
- pattern: .*googlevideo.com/(videoplayback(?=\?)).*[?&](id=[^&]+).*
|
||||
replace: youtube.fuzzy.replayweb.page/\1?\2
|
||||
- pattern: (?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?).*(video_id=[^&]+).*
|
||||
replace : youtube.fuzzy.replayweb.page/\1\2
|
||||
- pattern: i\.ytimg\.com\/vi\/(.*?)\/.*?\.(\w*?)(?:\?.*|$)
|
||||
replace : i.ytimg.com.fuzzy.replayweb.page/vi/\1/thumbnail.\2
|
||||
- pattern: ([^?]+)\?[\d]+$
|
||||
replace : \1
|
||||
- pattern: (?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*
|
||||
replace : youtube.fuzzy.replayweb.page/\1?\2
|
||||
- pattern: (?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*
|
||||
replace : youtube.fuzzy.replayweb.page/embed/\1
|
||||
# next one is a custom warc2zim rule intended to fix Vimeo support
|
||||
- pattern: .*(?:gcs-vimeo|vod|vod-progressive|vod-adaptive)\.akamaized\.net.*/(.+?.mp4)\?.*range=(.*?)(?:&|$)
|
||||
replace : vimeo-cdn.fuzzy.replayweb.page/\1?range=\2
|
||||
- pattern: .*(?:gcs-vimeo|vod|vod-progressive)\.akamaized\.net.*?/([\d/]+.mp4)$
|
||||
replace : vimeo-cdn.fuzzy.replayweb.page/\1
|
||||
- pattern: .*player.vimeo.com/(video/[\d]+)\?.*
|
||||
replace : vimeo.fuzzy.replayweb.page/\1
|
||||
- pattern: .*i\.vimeocdn\.com\/(.*)\?.*
|
||||
replace : i.vimeocdn.fuzzy.replayweb.page/\1
|
Loading…
Add table
Add a link
Reference in a new issue