mirror of
https://github.com/openzim/warc2zim.git
synced 2025-10-19 06:23:16 +00:00
Adapt to zimscraperlib 5.0.0 - including all rewriting logic moved there - and upgrade other dependencies
This commit is contained in:
parent
5040eeeffb
commit
1218df0560
46 changed files with 127 additions and 8886 deletions
13
.github/workflows/Publish.yaml
vendored
13
.github/workflows/Publish.yaml
vendored
|
@ -6,7 +6,7 @@ on:
|
|||
|
||||
jobs:
|
||||
publish:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
id-token: write # mandatory for PyPI trusted publishing
|
||||
|
||||
|
@ -24,17 +24,6 @@ jobs:
|
|||
pip install -U pip
|
||||
pip install -e .[scripts]
|
||||
|
||||
- name: Generate fuzzy rules
|
||||
run: python rules/generate_rules.py
|
||||
|
||||
- name: Build Javascript wombatSetup.js
|
||||
uses: addnab/docker-run-action@v3
|
||||
with:
|
||||
image: node:20-bookworm
|
||||
options: -v ${{ github.workspace }}/src/warc2zim/statics:/output -v ${{ github.workspace }}/rules:/src/rules -v ${{ github.workspace }}/javascript:/src/javascript -v ${{ github.workspace }}/build_js.sh:/src/build_js.sh
|
||||
run: |
|
||||
/src/build_js.sh
|
||||
|
||||
- name: Build packages
|
||||
run: |
|
||||
pip install -U pip build
|
||||
|
|
10
.github/workflows/PublishDockerDevImage.yaml
vendored
10
.github/workflows/PublishDockerDevImage.yaml
vendored
|
@ -7,19 +7,11 @@ on:
|
|||
|
||||
jobs:
|
||||
publish:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Build Javascript wombatSetup.js
|
||||
uses: addnab/docker-run-action@v3
|
||||
with:
|
||||
image: node:20-bookworm
|
||||
options: -v ${{ github.workspace }}/src/warc2zim/statics:/output -v ${{ github.workspace }}/rules:/src/rules -v ${{ github.workspace }}/javascript:/src/javascript -v ${{ github.workspace }}/build_js.sh:/src/build_js.sh
|
||||
run: |
|
||||
/src/build_js.sh
|
||||
|
||||
- name: Build and push Docker image
|
||||
uses: openzim/docker-publish-action@v10
|
||||
with:
|
||||
|
|
22
.github/workflows/QA.yaml
vendored
22
.github/workflows/QA.yaml
vendored
|
@ -8,7 +8,7 @@ on:
|
|||
|
||||
jobs:
|
||||
check-qa:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
@ -24,9 +24,6 @@ jobs:
|
|||
pip install -U pip
|
||||
pip install -e .[lint,scripts,test,check]
|
||||
|
||||
- name: Generate fuzzy rules
|
||||
run: python rules/generate_rules.py
|
||||
|
||||
- name: Check black formatting
|
||||
run: inv lint-black
|
||||
|
||||
|
@ -35,20 +32,3 @@ jobs:
|
|||
|
||||
- name: Check pyright
|
||||
run: inv check-pyright
|
||||
|
||||
- name: Set up Node.JS
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
|
||||
- name: Install JS dependencies
|
||||
working-directory: javascript
|
||||
run: yarn install
|
||||
|
||||
- name: Check prettier formatting
|
||||
working-directory: javascript
|
||||
run: yarn prettier-check
|
||||
|
||||
- name: Check eslint rules
|
||||
working-directory: javascript
|
||||
run: yarn eslint
|
||||
|
|
2
.github/workflows/TestWebsite.yaml
vendored
2
.github/workflows/TestWebsite.yaml
vendored
|
@ -7,7 +7,7 @@ on:
|
|||
|
||||
jobs:
|
||||
publish:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
|
30
.github/workflows/Tests.yaml
vendored
30
.github/workflows/Tests.yaml
vendored
|
@ -8,7 +8,7 @@ on:
|
|||
|
||||
jobs:
|
||||
run-tests:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
@ -24,9 +24,6 @@ jobs:
|
|||
pip install -U pip
|
||||
pip install -e .[test,scripts]
|
||||
|
||||
- name: Generate fuzzy rules
|
||||
run: python rules/generate_rules.py
|
||||
|
||||
- name: Run the tests
|
||||
run: inv coverage --args "-vvv"
|
||||
|
||||
|
@ -35,21 +32,8 @@ jobs:
|
|||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
|
||||
- name: Set up Node.JS
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
|
||||
- name: Install JS dependencies
|
||||
working-directory: javascript
|
||||
run: yarn install
|
||||
|
||||
- name: Run JS tests
|
||||
working-directory: javascript
|
||||
run: yarn test
|
||||
|
||||
build_python:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
|
@ -59,21 +43,13 @@ jobs:
|
|||
python-version-file: pyproject.toml
|
||||
architecture: x64
|
||||
|
||||
- name: Install dependencies (and project)
|
||||
run: |
|
||||
pip install -U pip build
|
||||
pip install -e .[scripts]
|
||||
|
||||
- name: Generate fuzzy rules
|
||||
run: python rules/generate_rules.py
|
||||
|
||||
- name: Ensure we can build Python targets
|
||||
run: |
|
||||
pip install -U pip build
|
||||
python3 -m build --sdist --wheel
|
||||
|
||||
build_docker:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
|
|
12
.gitignore
vendored
12
.gitignore
vendored
|
@ -495,18 +495,6 @@ pyrightconfig.json
|
|||
# ignore all vscode, this is not standard configuration in this place
|
||||
.vscode
|
||||
|
||||
# installed at build time
|
||||
src/warc2zim/statics/wombat.js
|
||||
|
||||
# temporary directories used during development
|
||||
output
|
||||
tmp
|
||||
|
||||
# rule files are generated by rules/generate_rules.py
|
||||
src/warc2zim/rules.py
|
||||
tests/test_fuzzy_rules.py
|
||||
javascript/src/fuzzyRules.js
|
||||
javascript/test/fuzzyRules.js
|
||||
|
||||
# wombatSetup.js is generated with rollup
|
||||
src/warc2zim/statics/wombatSetup.js
|
||||
|
|
|
@ -11,11 +11,11 @@ repos:
|
|||
hooks:
|
||||
- id: black
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.6.9
|
||||
rev: v0.8.4
|
||||
hooks:
|
||||
- id: ruff
|
||||
- repo: https://github.com/RobertCraigie/pyright-python
|
||||
rev: v1.1.383
|
||||
rev: v1.1.391
|
||||
hooks:
|
||||
- id: pyright
|
||||
name: pyright (system)
|
||||
|
|
|
@ -9,7 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
### Changed
|
||||
|
||||
- Upgrade to wombat 3.8.6 (#334)
|
||||
- Upgrade dependencies: zimscraperlib 5.0.0, warcio 1.7.5, cdxj_index 1.4.6 and others
|
||||
- Use all rewriting stuff from zimscraperlib
|
||||
- Remove most HTML / CSS / JS rewriting logic which is now part of zimscraperlib 5
|
||||
- Fix wombat setup settings (especially `isSW`) (#293)
|
||||
|
||||
### Fixed
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
FROM python:3.12-slim-bookworm
|
||||
LABEL org.opencontainers.image.source https://github.com/openzim/warc2zim
|
||||
LABEL org.opencontainers.image.source=https://github.com/openzim/warc2zim
|
||||
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
|
@ -12,15 +12,13 @@ RUN apt-get update -y \
|
|||
WORKDIR /output
|
||||
|
||||
# Copy pyproject.toml and its dependencies
|
||||
COPY pyproject.toml openzim.toml README.md /src/
|
||||
COPY rules/generate_rules.py /src/rules/generate_rules.py
|
||||
COPY pyproject.toml README.md /src/
|
||||
COPY src/warc2zim/__about__.py /src/src/warc2zim/__about__.py
|
||||
|
||||
# Install Python dependencies
|
||||
RUN pip install --no-cache-dir /src
|
||||
|
||||
# Copy code + associated artifacts
|
||||
COPY rules /src/rules
|
||||
COPY src /src/src
|
||||
COPY *.md /src/
|
||||
|
||||
|
|
21
README.md
21
README.md
|
@ -168,26 +168,13 @@ Start a hatch shell: this will install software including dependencies in an iso
|
|||
hatch shell
|
||||
```
|
||||
|
||||
### Regenerate wombatSetup.js
|
||||
### Rewriting logic and rewriting rules
|
||||
|
||||
wombatSetup.js is the JS code used to setup wombat when the ZIM is used.
|
||||
Almost all rewriting logic and rewriting rules now come from the [python-scraperlib](https://github.com/openzim/python-scraperlib/).
|
||||
|
||||
It is normally retrieved by Python build process (see openzim.toml for details).
|
||||
Should you need to add more rules or modify rewriting logic, this is the place to go.
|
||||
|
||||
Recommended solution to develop this JS code is to install Node.JS on your system, and then
|
||||
|
||||
```bash
|
||||
cd javascript
|
||||
yarn build-dev # or yarn build-prod
|
||||
```
|
||||
|
||||
Should you want to regenerate this code without installing Node.JS, you can simply run the following command.
|
||||
|
||||
```bash
|
||||
docker run -v $PWD/src/warc2zim/statics:/output -v $PWD/rules:/src/rules -v $PWD/javascript:/src/javascript -v $PWD/build_js.sh:/src/build_js.sh -it --rm --entrypoint /src/build_js.sh node:20-bookworm
|
||||
```
|
||||
|
||||
It will install Python3 on-top of Node.JS in a Docker container, generate JS fuzzy rules and bundle JS code straight to `/src/warc2zim/statics/wombatSetup.js` where the file is expected to be placed.
|
||||
All resulting code (Python and Javascript) as well as wombat.js and wombat-setup.js come from the python-scraperlib.
|
||||
|
||||
## License
|
||||
|
||||
|
|
26
build_js.sh
26
build_js.sh
|
@ -1,26 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Custom script to install Python on top of a Docker Node-JS image, then install
|
||||
# required Python deps, generate fuzzy rules, and finally bundle JS script
|
||||
|
||||
apt-get update -y
|
||||
|
||||
apt-get install -y --no-install-recommends \
|
||||
python3 python3-pip python3-venv
|
||||
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
python3 -m venv /local
|
||||
|
||||
/local/bin/python -m pip install --no-cache-dir -U \
|
||||
pip \
|
||||
jinja2==3.1.4 \
|
||||
PyYAML==6.0.2
|
||||
|
||||
/local/bin/python /src/rules/generate_rules.py
|
||||
|
||||
cd /src/javascript
|
||||
|
||||
yarn install
|
||||
|
||||
OUTPUT_DIR=/output yarn build-prod
|
|
@ -1,84 +0,0 @@
|
|||
# Functional architecture
|
||||
|
||||
## Foreword
|
||||
|
||||
At a high level, warc2zim is a piece of software capable of transforming a set of WARC files into one ZIM file. From a functional point of view, it is hence a "format converter".
|
||||
|
||||
While warc2zim is typically used as a sub-component of zimit, where WARC files are produced by Browsertrix crawler, it is in fact agnostic of this fact and could process any WARC file adhering to the standard.
|
||||
|
||||
This documentation describes the main functions performed by the warc2zim codebase. It is important to note that these functions are not segregated inside the codebase by clear boundaries.
|
||||
|
||||
## ZIM storage
|
||||
|
||||
While storing the web resources in the ZIM is mostly straightforward (we just transfer the raw bytes, after some modification for URL rewriting if needed), the decision of the path where the resource will be stored is very important.
|
||||
|
||||
This is purely conventional, even if ZIM specification has to be respected for proper operation in readers.
|
||||
|
||||
This function is responsible for computing the ZIM path where a given web resource is going to be stored.
|
||||
|
||||
While the URL is the only driver of this computation for now, warc2zim might have to consider other contextual data in the future. E.g. the resource to serve might be dynamic, depending not only on URL query parameters but also header(s) value(s).
|
||||
|
||||
## Fuzzy rules
|
||||
|
||||
Unfortunately, it is not always possible / desirable to store the resource with a simple transformation.
|
||||
|
||||
A typical situation is that some query parameters are dynamically computed by some Javascript code to include user tracking identifier, current datetime information, ...
|
||||
|
||||
When running again the same javascript code inside the ZIM, the URL will hence be slightly different because context has changed, but the same content needs to be retrieved.
|
||||
|
||||
warc2zim hence relies on fuzzy rules to transform/simplify some URLs when computing the ZIM path.
|
||||
|
||||
## URL Rewriting
|
||||
|
||||
warc2zim transforms (rewrites) URLs found in documents (HTML, CSS, JS, ...) so that they are usable inside the ZIM.
|
||||
|
||||
### General case
|
||||
|
||||
One simple example is that we might have following code in an HTML document to load an image with an absolute URL:
|
||||
|
||||
```
|
||||
<img src="https://en.wikipedia.org/wiki/File:Kiwix_logo_v3.svg"></img>
|
||||
```
|
||||
|
||||
The URL `https://en.wikipedia.org/wiki/File:Kiwix_logo_v3.svg` has to be transformed to a URL that is usable inside the ZIM.
|
||||
|
||||
For proper reader operation, openZIM prohibits using absolute URLs, so this has to be a relative URL. This relative URL is hence dependent on the location of the resource currently being rewritten.
|
||||
|
||||
The table below gives some examples of what the rewritten URL is going to be, depending on the URL of the rewritten document.
|
||||
|
||||
| HTML document URL | image URL rewritten for usage inside the ZIM |
|
||||
|--|--|
|
||||
| `https://en.wikipedia.org/wiki/Kiwix` | `./File:Kiwix_logo_v3.svg` |
|
||||
| `https://en.wikipedia.org/wiki` | `./wiki/File:Kiwix_logo_v3.svg` |
|
||||
| `https://en.wikipedia.org/waka/Kiwix` | `../wiki/File:Kiwix_logo_v3.svg` |
|
||||
| `https://fr.wikipedia.org/wiki/Kiwix` | `../../en.wikipedia.org/wiki/File:Kiwix_logo_v3.svg` |
|
||||
|
||||
As can be seen on the last line (but this is true for all URLs), this rewriting has to take into account the convention saying at which ZIM path a given web resource will be stored.
|
||||
|
||||
### Dynamic case
|
||||
|
||||
The explanation above more or less assumed that the transformations can be done statically, i.e warc2zim can open every known document, find existing URLs and replace them with their counterpart inside the ZIM.
|
||||
|
||||
While this is possible for HTML and CSS documents typically, it is not possible when the URL is dynamically computed. This is typically the case for JS documents, where in the general case the URL is not statically stored inside the JS code but computed on-the-fly by aggregating various strings and values.
|
||||
|
||||
Rewriting these computations is not deemed feasible due to the huge variety of situations which might be encountered.
|
||||
|
||||
A specific function is hence needed to rewrite URL **live in client browser**, intercept any function triggering a web request, transform the URL according to conventions (where we expect the resource to be located in the general case) and fuzzy rules.
|
||||
|
||||
_Spoiler: this is where we will rely on wombat.js from webrecorder team, since this dynamic interception is quite complex and already done quite neatly by them_
|
||||
|
||||
### Fuzzy rules
|
||||
|
||||
The same fuzzy rules that have been used to compute the ZIM path from a resource URL have to be applied again when rewriting URLs.
|
||||
|
||||
While this is expected to serve mostly for the dynamic case, we still apply them on both sides (statically and dynamically) for consistency.
|
||||
|
||||
## Documents rewritten statically
|
||||
|
||||
For now warc2zim rewrites HTML, CSS and JS documents. For CSS and JS, this mainly consists in replacing URLs. For HTML, we also have more specific rewriting necessary (e.g. to handle base href or redirects with meta).
|
||||
|
||||
Since 2.1, no domain specific (DS) rules are applied like it is done in wabac.JS because these rules are already applied in Browsertrix Crawler. For the same reason, JSON is not rewritten anymore (URLs do not need to be rewritten in JSON because these URLs will be used by JS, intercepted by wombat and dynamically rewritten).
|
||||
|
||||
JSONP callbacks are supposed to be rewritten but this has not been heavily tested.
|
||||
|
||||
Other types of documents are supposed to be either not feasible / not worth it (e.g. URLs inside PDF documents), meaningless (e.g. images, fonts) or planned for later due to limited usage in the wild (e.g. XML).
|
|
@ -1,48 +0,0 @@
|
|||
# Software architecture
|
||||
|
||||
## HTML rewriting
|
||||
|
||||
HTML rewriting is purely static (i.e. before resources are written to the ZIM). HTML code is parsed with the [HTML parser from Python standard library](https://docs.python.org/3/library/html.parser.html).
|
||||
|
||||
A small header script is inserted in HTML code to initialize wombat.js which will wrap all JS APIs to dynamically rewrite URLs coming from JS.
|
||||
|
||||
This header script is generated using [Jinja2](https://pypi.org/project/Jinja2/) template since it needs to populate some JS context variables needed by wombat.js operations (original scheme, original url, ...).
|
||||
|
||||
## CSS rewriting
|
||||
|
||||
CSS rewriting is purely static (i.e. before resources are written to the ZIM). CSS code is parsed with the [tinycss2 Python library](https://pypi.org/project/tinycss2/).
|
||||
|
||||
## JS rewriting
|
||||
|
||||
### Static
|
||||
|
||||
Static JS rewriting is simply a matter of pure textual manipulation with regular expressions. No parsing is done at all.
|
||||
|
||||
### Dynamic
|
||||
|
||||
Dynamic JS rewriting is done with [wombat JS library](https://github.com/webrecorder/wombat). The same fuzzy rules that are used for static rewriting are injected into wombat configuration. Code to rewrite URLs is an adapted version of the code used to compute ZIM paths.
|
||||
|
||||
For wombat setup, including the URL rewriting part, we need to pass wombat configuration info. This code is developed in the `javascript` folder. For URL parsing, it relies on the [uri-js library](https://www.npmjs.com/package/uri-js). This javascript code is bundled into a single `wombatSetup.js` file with [rollup bundler](https://rollupjs.org), the same bundler used by webrecorder team to bundle wombat.
|
||||
|
||||
## cdxj_indexer and warcio
|
||||
|
||||
[cdxj_indexer Python library](https://pypi.org/project/cdxj-indexer/) is a thin wrapper over [warcio Python library](https://pypi.org/project/warcio/). It is used to iterate over all records in WARCs.
|
||||
|
||||
It provides two main features:
|
||||
|
||||
- Loop over several WARCs in a directory (A visit of a website may be stored in several WARCs in the same directory).
|
||||
- Provide buffered access to WARC content (and not a "stream" (fileio) only API), by monkey patching the returned WarcRecord.
|
||||
|
||||
Apart from that, the scraper directly uses WarcRecord (returned by cdxj_indexer, implemented in warcio) to access metadata and such.
|
||||
|
||||
## zimscraperlib
|
||||
|
||||
[zimscraperlib Python library](https://pypi.org/project/zimscraperlib) is used for ZIM operations.
|
||||
|
||||
## requests
|
||||
|
||||
[requests Python library](https://pypi.org/project/requests/) is used to retrieve the custom CSS file when a URL is passed.
|
||||
|
||||
## brotlipy
|
||||
|
||||
[brotlipy Python library](https://pypi.org/project/brotlipy/) is used to access brotli content in WARC records (not part of warcio because it is an optional dependency).
|
|
@ -1,100 +0,0 @@
|
|||
# Technical architecture
|
||||
|
||||
## Fuzzy rules
|
||||
|
||||
Fuzzy rules are stored in `rules/rules.yaml`. This configuration file is then used by `rules/generate_rules.py` to generate Python and JS code.
|
||||
|
||||
Should you update these fuzzy rules, you hence have to:
|
||||
- regenerate Python and JS files by running `python rules/generate_rules.py`
|
||||
- bundle again Javascript `wombatSetup.js` (see below).
|
||||
|
||||
## Wombat configuration
|
||||
|
||||
Wombat configuration contains some static configuration and the dynamic URL rewriting, including fuzzy rules.
|
||||
|
||||
It is bundled by rollup with `cd javascript && yarn build-prod` and the result is pushed to proper scraper location for inclusion at build time.
|
||||
|
||||
Tests are available and run with `cd javascript && yarn test`.
|
||||
|
||||
## Scraper operations
|
||||
|
||||
### High level overview
|
||||
|
||||
The scraper behavior is done in two phases.
|
||||
|
||||
First the WARC records are iterated to compute the ZIM metadata (find main path, favicon, ...) and detect which ZIM paths are expected to be populated. This is mandatory to know, when we rewrite the documents, whether the URLs we encounter lead to something which is internal (inside the ZIM) and should be rewritten, or external and should be kept as-is.
|
||||
|
||||
Second, the WARC records are iterated to be transformed and appended inside the ZIM. ZIM records are appended to the ZIM on the fly.
|
||||
|
||||
In both phases, WARC records are iterated in natural order, i.e. as they have been retrieved online during the crawl.
|
||||
|
||||
### Transformation of URL into ZIM path
|
||||
|
||||
Transforming a URL into a ZIM path has to respect the ZIM specification: path must not be url-encoded (i.e. it must be decoded) and it must be stored as UTF-8.
|
||||
|
||||
A WARC record stores the item's URL inside a header named "WARC-Target-URI". The value inside this header is encoded, or more exactly it is "exactly what the browser sent at the HTTP level" (see https://github.com/webrecorder/browsertrix-crawler/issues/492 for more details).
|
||||
|
||||
It has been decided (by convention) that we will drop the scheme, the port, the username and password from the URL. Headers are also not considered in this computation.
|
||||
|
||||
Computation of the ZIM path is hence mostly straightforward:
|
||||
- decode the hostname which is puny-encoded
|
||||
- decode the path and query parameter which might be url-encoded
|
||||
|
||||
## Rewriting documents
|
||||
|
||||
Some documents (HTML, CSS, JS and JSON for now) need to be rewritten, e.g. to rewrite URLs, adapt some code to the ZIM context, ...
|
||||
|
||||
The first important step when processing a WARC entry to add it as a ZIM entry is hence to properly detect which kind of document we are dealing with.
|
||||
|
||||
This is done in the `get_rewrite_mode` function of the `Rewriter` class. Before 2.0.1, scraper was relying only on mimetype as returned in `Content-Type` HTTP response.
|
||||
|
||||
Unfortunately, this caused problems where some servers return wrong information in this header, e.g. Cloudflare seems to frequently return `text/html` for woff2 fonts; this causes the scraper to fail, because it is impossible to know in advance that we should ignore these errors: we could have a real document which should be rewritten but is failing.
|
||||
|
||||
Since 2.0.1, we've enriched the logic by using the new WARC header `WARC-Resource-Type` which contains the type of resources "as perceived by the browser" (from https://chromedevtools.github.io/devtools-protocol/tot/Network/#type-ResourceType, see https://github.com/webrecorder/browsertrix-crawler/pull/481). Unfortunately this information is not sufficient because of some very generic values returned like `fetch` or `xhr`. The scraper still needs to mix this information with the mimetype. Ideally, we would have preferred to find a single source of truth not relying on something returned by the server, but it is not available for now (see https://github.com/openzim/warc2zim/issues/340 for a discussion on this topic).
|
||||
|
||||
### URL rewriting
|
||||
|
||||
In addition to the computation of the relative path from the current document URL to the URL to rewrite, URL rewriting also consists in computing the proper ZIM path (with same operation as above) and properly encoding it so that the resulting URL respects [RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986). Some important stuff has to be noted in this encoding.
|
||||
|
||||
- since the original hostname is now part of the path, it will now be url-encoded
|
||||
- since the `?` and following query parameters are also part of the path (we do not want readers to drop them like kiwix-serve would do), they are also url-encoded
|
||||
|
||||
Below is an example case of the rewrite operation on an image URL found in an HTML document.
|
||||
|
||||
- Document original URL: `https://kiwix.org/a/article/document.html`
|
||||
- Document ZIM path: `kiwix.org/a/article/document.html`
|
||||
- Image original URL: `//xn--exmple-cva.com/a/resource/image.png?foo=bar`
|
||||
- Image rewritten URL: `../../../ex%C3%A9mple.com/a/resource/image.png%3Ffoo%3Dbar`
|
||||
- Image ZIM Path: `exémple.com/a/resource/image.png?foo=bar`
|
||||
|
||||
### JS Rewriting
|
||||
|
||||
JS Rewriting is a bit special because the rules to apply differ depending on whether we are using "classic" Javascript or "module" Javascript.
|
||||
|
||||
Detection of Javascript modules starts at the HTML level where we have a `<script type="module" src="...">` tag. This tells us that the file at the src location is a Javascript module. From there we know that its subresources are also Javascript modules.
|
||||
|
||||
Currently this detection is done on-the-fly, based on the fact that WARC items are processed in the same order that they have been fetched by the browser, and we hence do not need a multi-pass approach. Meaning that HTML will be processed first, then parent JS, then its dependencies, ... **This is a strong assumption**.
|
||||
|
||||
### Different kinds of WARC records
|
||||
|
||||
The WARC to ZIM conversion is performed by transforming WARC records into ZIM records.
|
||||
|
||||
For `response` records, the rewritten payload (only, without HTTP headers) is stored inside the ZIM.
|
||||
|
||||
If the payload is zero-length, the record is omitted to conform to ZIM specifications of not storing empty records.
|
||||
|
||||
For `request` and `resource` records, they are simply ignored. These records do not convey important information for now.
|
||||
|
||||
**TODO** better explain what `request` and `resource` records are and why they might point to a different URL.
|
||||
|
||||
For `revisit` records, a ZIM alias is created if the revisit points to a different URL.
|
||||
|
||||
**TODO** better explain what `revisit` records are and why they might point to a different URL.
|
||||
|
||||
### Duplicate URIs
|
||||
|
||||
WARCs allow multiple records for the same URL, while ZIM does not. As a result, only the first encountered response or resource record is stored in the ZIM, and subsequent records are ignored.
|
||||
|
||||
For revisit records, they are only added as a ZIM alias if pointing to a different URL, and are processed after response records. A revisit record to the same URL will always be ignored.
|
||||
|
||||
All other WARC records are skipped.
|
|
@ -1,2 +0,0 @@
|
|||
src/fuzzyRules.js
|
||||
test/fuzzyRules.js
|
|
@ -1,3 +0,0 @@
|
|||
{
|
||||
"singleQuote": true
|
||||
}
|
|
@ -1,7 +0,0 @@
|
|||
export default [
|
||||
{
|
||||
rules: {
|
||||
'prefer-const': 'error',
|
||||
},
|
||||
},
|
||||
];
|
|
@ -1,43 +0,0 @@
|
|||
{
|
||||
"name": "@openzim/wombat-setup",
|
||||
"type": "module",
|
||||
"version": "2.1.3-dev0",
|
||||
"license": "GPL-3.0-or-later",
|
||||
"author": "openZIM",
|
||||
"devDependencies": {
|
||||
"@rollup/plugin-commonjs": "26.0.1",
|
||||
"@rollup/plugin-node-resolve": "15.2.3",
|
||||
"@rollup/plugin-terser": "0.4.4",
|
||||
"ava": "^6.1.3",
|
||||
"eslint": "9.9.1",
|
||||
"eslint-config-prettier": "9.1.0",
|
||||
"prettier": "3.3.3",
|
||||
"rollup": "4.21.2"
|
||||
},
|
||||
"scripts": {
|
||||
"prettier-check": "prettier . --check",
|
||||
"prettier-fix": "prettier . --write",
|
||||
"eslint": "eslint .",
|
||||
"test": "ava --verbose",
|
||||
"build-prod": "rollup -c rollup.config.js",
|
||||
"build-dev": "DEV=1 rollup -c rollup.config.js",
|
||||
"build-dev-watch": "DEV=1 rollup --watch -c rollup.config.js"
|
||||
},
|
||||
"prettier": {
|
||||
"singleQuote": true
|
||||
},
|
||||
"ava": {
|
||||
"concurrency": 1,
|
||||
"verbose": true,
|
||||
"serial": true,
|
||||
"files": [
|
||||
"test/*.js"
|
||||
],
|
||||
"sources": [
|
||||
"src/**/*"
|
||||
]
|
||||
},
|
||||
"dependencies": {
|
||||
"uri-js": "^4.4.1"
|
||||
}
|
||||
}
|
|
@ -1,43 +0,0 @@
|
|||
import path from 'path';
|
||||
import url from 'url';
|
||||
|
||||
import { nodeResolve } from '@rollup/plugin-node-resolve'; // used to bundle node_modules code
|
||||
import commonjs from '@rollup/plugin-commonjs'; // used to bundle CommonJS node_modules
|
||||
import terser from '@rollup/plugin-terser'; // used to minify JS code
|
||||
|
||||
const __filename = url.fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
const outputDir =
|
||||
process.env.OUTPUT_DIR || path.join(__dirname, '../src/warc2zim/statics');
|
||||
|
||||
const noStrict = {
|
||||
renderChunk(code) {
|
||||
return code.replace("'use strict';", '');
|
||||
},
|
||||
};
|
||||
|
||||
const watchOptions = {
|
||||
exclude: 'node_modules/**',
|
||||
chokidar: {
|
||||
alwaysStat: true,
|
||||
usePolling: true,
|
||||
},
|
||||
};
|
||||
|
||||
const plugins = [nodeResolve({ preferBuiltins: false }), commonjs(), noStrict];
|
||||
if (!process.env.DEV) {
|
||||
plugins.push(terser());
|
||||
}
|
||||
|
||||
export default {
|
||||
input: 'src/wombatSetup.js',
|
||||
output: {
|
||||
name: 'wombatSetup',
|
||||
file: path.join(outputDir, 'wombatSetup.js'),
|
||||
sourcemap: false,
|
||||
format: 'iife',
|
||||
exports: 'named',
|
||||
},
|
||||
watch: watchOptions,
|
||||
plugins: plugins,
|
||||
};
|
|
@ -1,313 +0,0 @@
|
|||
import { fuzzyRules } from './fuzzyRules.js';
|
||||
import URI from 'uri-js';
|
||||
|
||||
export function applyFuzzyRules(path) {
|
||||
// Apply fuzzy rules to simplify the ZIM path. First matching rule is applied and
|
||||
// result is immediately returned
|
||||
|
||||
for (const rule of fuzzyRules) {
|
||||
const new_path = path.replace(new RegExp(rule.match), rule.replace);
|
||||
if (new_path != path) {
|
||||
return new_path;
|
||||
}
|
||||
}
|
||||
return path;
|
||||
}
|
||||
|
||||
export function hasAlreadyBeenRewritten(
|
||||
original_absolute_url,
|
||||
orig_url,
|
||||
uri,
|
||||
url,
|
||||
) {
|
||||
// Detect (with a heuristic) that the path is most probably already rewritten and
|
||||
// must be kept as-is. We just need to detect relative links (all statically rewritten
|
||||
// links are relative) and contains a path including the hostname (which cannot be
|
||||
// joined with the orig_url since if it includes the hostname, it means it is in
|
||||
// another hostname than orig_url and will hence go one level too high in the path
|
||||
// hierarchy, hence working only on ZIM paths / relative links).
|
||||
// The heuristic is:
|
||||
// - the link must be relative and start by going at least one level up
|
||||
// - the first non relative part of the path (i.e. not . or ..) looks like a hostname
|
||||
// (i.e. it contains a dot)
|
||||
// - the relative link, when merged with orig_url, is going exactly one "path level"
|
||||
// too high in the hierarchy
|
||||
if (typeof uri.scheme == 'undefined' && url.startsWith('../')) {
|
||||
const urlParts = url.split('/');
|
||||
const original_absolute_url1 = URI.resolve(
|
||||
orig_url,
|
||||
urlParts.slice(1).join('/'),
|
||||
);
|
||||
const original_absolute_url2 = URI.resolve(
|
||||
orig_url,
|
||||
urlParts.slice(2).join('/'),
|
||||
);
|
||||
// detect that relative link is going exactly one "path level" too high
|
||||
if (
|
||||
original_absolute_url1 == original_absolute_url &&
|
||||
original_absolute_url2 != original_absolute_url
|
||||
) {
|
||||
const firstNonRelativePart = urlParts.find((urlPart) => urlPart !== '..');
|
||||
// detect that first non relative part of the path looks like a hostname
|
||||
if (firstNonRelativePart.indexOf('.') > -1) {
|
||||
// if all 3 conditions are true, then we assume it has already been rewritten
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// otherwise we don't know and assume it can be safely rewritten
|
||||
return false;
|
||||
}
|
||||
|
||||
function removeSubsequentSlashes(value) {
  // Collapse every run of two or more consecutive slashes into a single `/`.
  // E.g. `val//ue`, `val///ue` and `val////ue` (and so on) all become `val/ue`.
  const slashRun = /\/\/+/g;
  return value.replace(slashRun, '/');
}
|
||||
|
||||
// Rewrite `url` (as seen by wombat on the replayed page) into the URL of the matching
// ZIM entry: `prefix` + percent-encoded, fuzzified `host/path?query`.
//
// The URL is returned untouched when it is falsy, starts with `#`, `{` or `*`, has a
// scheme other than http/https, or is detected as already rewritten.
// `current_url`, `orig_host`, `orig_scheme`, `useRel`, `mod` and `doc` are only used in
// the debug trace emitted before returning.
export function urlRewriteFunction(
  current_url, // The current (real) url we are on, e.g. http://library.kiwix.org/content/myzim_yyyy-mm/www.example.com/index.html
  orig_host, // The host of the original url, e.g. www.example.com
  orig_scheme, // The scheme of the original url, e.g. https
  orig_url, // The original url, e.g. https://www.example.com/index.html
  prefix, // The (absolute) prefix to add to all our urls (from where we are served), e.g. http://library.kiwix.org/content/myzim_yyyy-mm/
  url, // first argument passed by wombat.JS at each invocation, current url to rewrite, e.g. http://library.kiwix.org/content/myzim_yyyy-mm/www.example.com/image.png
  useRel,
  mod,
  doc, // last argument passed by wombat.JS at each invocation
) {
  if (!url) return url;

  // Transform URL which might be an object (detected on Chromium browsers at least)
  url = String(url);

  // Special stuff which is not really a URI but exists in the wild
  if (['#', '{', '*'].includes(url.substring(0, 1))) return url;

  // If URI scheme is defined but not http or https, we must not rewrite the URL
  const uri = URI.parse(url);
  if (
    typeof uri.scheme !== 'undefined' &&
    !['http', 'https'].includes(uri.scheme)
  )
    return url;

  // If url starts with prefix, we need to remove this prefix before applying usual
  // rewrite rules
  if (url.startsWith(prefix)) {
    url = uri.scheme + '://' + url.substring(prefix.length);
  }

  // This is a hack to detect improper URL encoding; proper detection should be
  // possible with chardet or other alternatives but did not work so far; we hence
  // take advantage of the error below to detect improper URL encoding.
  // When improper URL encoding is detected, we try to encode URL as a best-effort;
  // 'best-effort', because if some part of the URL is encoded and another part is not,
  // this will fail ... but this is a weird edge case anyway
  try {
    decodeURIComponent(URI.parse(url).path);
  } catch (e) {
    url = encodeURI(url);
  }

  // Compute the absolute URI, just like the browser would have resolved it hopefully.
  // We need to use the original URL for that to properly detect the hostname when
  // present; current URL does not allow to do it easily
  const original_absolute_url = URI.resolve(orig_url, url);

  // Detect if url has probably already been rewritten and return as-is in such a case.
  // The heuristic lives entirely in hasAlreadyBeenRewritten; it must not be repeated
  // here, since a second identical check can never succeed once the first one failed.
  if (hasAlreadyBeenRewritten(original_absolute_url, orig_url, uri, url)) {
    return url;
  }

  // We now have to transform this absolute URI into a normalized ZIM path entry
  const absolute_url_parts = URI.parse(original_absolute_url);

  // Let's first compute the decoded host
  const serialized_host = URI.serialize(
    URI.parse('http://' + absolute_url_parts.host), // fake URI to benefit from decoding
    { iri: true }, // decode potentially puny-encoded host
  );
  // Strip the leading 'http://' (7 chars) and the last character of the serialization
  const decoded_host = serialized_host.substring(7, serialized_host.length - 1);

  // And the decoded path, only exception is that an empty path must resolve to '/' path
  // (our convention, just like in Python)
  const decoded_path =
    !absolute_url_parts.path || absolute_url_parts.path.length === 0
      ? '/'
      : decodeURIComponent(absolute_url_parts.path);

  // And the decoded query, only exception is that + sign must resolve to ' ' to avoid
  // confusion (our convention, just like in Python)
  const decoded_query =
    !absolute_url_parts.query || absolute_url_parts.query.length === 0
      ? ''
      : '?' + decodeURIComponent(absolute_url_parts.query).replaceAll('+', ' ');

  // combine all decoded parts to get the ZIM path
  const zimPath =
    decoded_host + removeSubsequentSlashes(decoded_path + decoded_query);

  // apply the fuzzy rules to the ZIM path
  const fuzzifiedPath = applyFuzzyRules(zimPath);

  // Reencode everything but '/' (we decode it afterwards for simplicity)
  const finalUrl =
    prefix + encodeURIComponent(fuzzifiedPath).replaceAll('%2F', '/');

  console.debug(
    'urlRewriten:\n\t- current_url: ' +
      current_url +
      '\n\t- orig_host: ' +
      orig_host +
      '\n\t- orig_scheme: ' +
      orig_scheme +
      '\n\t- orig_url: ' +
      orig_url +
      '\n\t- prefix: ' +
      prefix +
      '\n\t- url: ' +
      url +
      '\n\t- useRel: ' +
      useRel +
      '\n\t- mod: ' +
      mod +
      '\n\t- doc: ' +
      doc +
      '\n\t- finalUrl: ' +
      finalUrl.toString() +
      '\n\t',
  );

  return finalUrl;
}
|
||||
|
||||
export function getWombatInfo(
  current_url, // The current (real) url we are on
  orig_host, // The host of the original url
  orig_scheme, // The scheme of the original url
  orig_url, // The original url
  prefix, // The (absolute) prefix to add to all our urls (from where we are served)
) {
  // Rewrite function handed to wombat: the page-level context is captured here so
  // that wombat only supplies its own per-call arguments.
  const rewrite_function = (url, useRel, mod, doc) =>
    urlRewriteFunction(
      current_url,
      orig_host,
      orig_scheme,
      orig_url,
      prefix,
      url,
      useRel,
      mod,
      doc,
    );

  // Location of our static files (needed by wombat). It must not collide with any
  // other served url; wombat uses it to avoid rewriting these urls back.
  const static_prefix = prefix + '_zim_static/';

  return {
    rewrite_function,

    // Apparently only used as a target to send messages to; not relevant for us (?)
    top_url: current_url,

    // Apparently used to build the url of blobUrl returned by the SW; not relevant (?)
    url: orig_url,

    // Timestamp attached to messages sent to the top frame; not relevant for us
    timestamp: '',

    // Used for messages to the top frame and by the default rewrite function; not relevant
    request_ts: '',

    // The url on which we are served
    prefix,

    // Default mod
    mod: '',

    // Tells whether we are framed (and should message the top frame ?)
    is_framed: false,

    // ??
    is_live: false,

    // Seemingly never used
    coll: '',

    // Only set in proxy mode, which we do not use
    proxy_magic: '',

    static_prefix,

    wombat_ts: '',

    // Delay, in seconds, applied to all js time (`Date.now()`, ...)
    wombat_sec: 0,

    // Scheme of the original url
    wombat_scheme: orig_scheme,

    // Host of the original url
    wombat_host: orig_host,

    // We do not run inside a service worker; wombat must know since some "magic"
    // URLs like blobs are then unavailable
    isSW: false,

    // Turn every POST request into a GET request
    convert_post_to_get: true,

    // Unused: we are not replaying inside a frame
    target_frame: '___wb_replay_top_frame',

    // Unused: we are not running in live mode
    enable_auto_fetch: false,

    // Extra options, unused
    wombat_opts: {},
  };
}
|
||||
|
||||
// Default export bundling the module's public helpers
export default { applyFuzzyRules, urlRewriteFunction, getWombatInfo };
|
|
@ -1,42 +0,0 @@
|
|||
import test from 'ava';
|
||||
|
||||
import utils from '../src/wombatSetup.js';
|
||||
|
||||
// Shared fixture: where the ZIM is served from, and the original site being replayed
test.beforeEach((t) => {
  Object.assign(t.context, {
    prefix: 'http://library.kiwix.org/content/myzim_yyyy-mm/',
    originalHost: 'www.example.com',
    originalScheme: 'https',
  });
});

// getWombatInfo must echo the context it was given and fill every other field with
// its fixed default value
test('nominalWbInfo', (t) => {
  const { prefix, originalHost, originalScheme } = t.context;
  const path = 'path1/resource1.js';
  const originalUrl = `${originalScheme}://${originalHost}/${path}`;
  const currentUrl = prefix + path;

  const wmInfo = utils.getWombatInfo(
    currentUrl,
    originalHost,
    originalScheme,
    originalUrl,
    prefix,
  );

  t.is(wmInfo.coll, '');
  t.is(wmInfo.convert_post_to_get, true);
  t.is(wmInfo.enable_auto_fetch, false);
  t.is(wmInfo.isSW, false);
  t.is(wmInfo.is_framed, false);
  t.is(wmInfo.is_live, false);
  t.is(wmInfo.mod, '');
  t.is(wmInfo.prefix, prefix);
  t.is(wmInfo.proxy_magic, '');
  t.is(wmInfo.request_ts, '');
  t.is(wmInfo.static_prefix, prefix + '_zim_static/');
  t.is(wmInfo.target_frame, '___wb_replay_top_frame');
  t.is(wmInfo.timestamp, '');
  t.is(wmInfo.top_url, currentUrl);
  t.is(wmInfo.url, originalUrl);
  t.is(wmInfo.wombat_host, originalHost);
  t.deepEqual(wmInfo.wombat_opts, {});
  t.is(wmInfo.wombat_scheme, originalScheme);
  t.is(wmInfo.wombat_sec, 0);
  t.is(wmInfo.wombat_ts, '');
});
|
File diff suppressed because it is too large
Load diff
2069
javascript/yarn.lock
2069
javascript/yarn.lock
File diff suppressed because it is too large
Load diff
15
openzim.toml
15
openzim.toml
|
@ -1,15 +0,0 @@
|
|||
[files.assets.config]
|
||||
target_dir="src/warc2zim/statics"
|
||||
execute_after=[
|
||||
"cd ../../../ && python rules/generate_rules.py", # generate Python (and JS) rules
|
||||
]
|
||||
|
||||
[files.assets.actions."wombat.js"]
|
||||
action="get_file"
|
||||
source="https://cdn.jsdelivr.net/npm/@webrecorder/wombat@3.8.6/dist/wombat.js"
|
||||
target_file="wombat.js"
|
||||
|
||||
[files.assets.actions."wombatSetup.js"] # fallback if this script has not been properly built (should happen only in dev)
|
||||
action="get_file"
|
||||
source="https://dev.kiwix.org/warc2zim/wombatSetup.js"
|
||||
target_file="wombatSetup.js"
|
|
@ -1,7 +1,5 @@
|
|||
[build-system]
|
||||
# jinja2 is required to generate JS and Python rules at build time
|
||||
# PyYAML is used to parse fuzzy rules and generate Python/JS code
|
||||
requires = ["hatchling", "hatch-openzim==0.2.1", "jinja2==3.1.4", "PyYAML==6.0.2"]
|
||||
requires = ["hatchling", "hatch-openzim==0.2.1"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
|
@ -10,15 +8,15 @@ requires-python = ">=3.12,<3.13"
|
|||
description = "Convert WARC to ZIM"
|
||||
readme = "README.md"
|
||||
dependencies = [
|
||||
"warcio==1.7.4",
|
||||
"warcio==1.7.5",
|
||||
"requests==2.32.3",
|
||||
"zimscraperlib==4.0.0",
|
||||
"zimscraperlib==5.0.0rc2",
|
||||
"jinja2==3.1.4", # also update version in build-system above and in build_js.sh
|
||||
# to support possible brotli content in warcs, must be added separately
|
||||
"brotlipy==0.7.0",
|
||||
"cdxj_indexer==1.4.5",
|
||||
"tinycss2==1.3.0",
|
||||
"beautifulsoup4==4.12.3", # used to parse base href
|
||||
"cdxj_indexer==1.4.6",
|
||||
"tinycss2==1.4.0",
|
||||
"beautifulsoup4==4.12.3", # used to parse base href
|
||||
"lxml==5.3.0", # used by beautifulsoup4 for parsing html
|
||||
"python-dateutil==2.9.0.post0",
|
||||
]
|
||||
|
@ -32,27 +30,24 @@ additional-keywords = ["warc"]
|
|||
name="Webrecorder Software"
|
||||
email="info@webrecorder.net"
|
||||
|
||||
[tool.hatch.build.hooks.openzim-build]
|
||||
|
||||
[project.optional-dependencies]
|
||||
scripts = [
|
||||
"invoke==2.2.0",
|
||||
"PyYAML==6.0.2", # used to parse fuzzy rules and generate Python/JS code ; also update version in build-system above and in build_js.sh
|
||||
]
|
||||
lint = [
|
||||
"black==24.10.0",
|
||||
"ruff==0.6.9",
|
||||
"ruff==0.8.4",
|
||||
]
|
||||
check = [
|
||||
"pyright==1.1.383",
|
||||
"pyright==1.1.391",
|
||||
]
|
||||
test = [
|
||||
"pytest==8.3.3",
|
||||
"coverage==7.6.1",
|
||||
"pytest==8.3.4",
|
||||
"coverage==7.6.9",
|
||||
]
|
||||
dev = [
|
||||
"pre-commit==4.0.0",
|
||||
"debugpy==1.8.6",
|
||||
"pre-commit==4.0.1",
|
||||
"debugpy==1.8.11",
|
||||
"warc2zim[scripts]",
|
||||
"warc2zim[lint]",
|
||||
"warc2zim[test]",
|
||||
|
@ -72,10 +67,6 @@ exclude = [
|
|||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/warc2zim"]
|
||||
artifacts = [
|
||||
"src/warc2zim/statics/**",
|
||||
"src/warc2zim/rules.py",
|
||||
]
|
||||
|
||||
[tool.hatch.envs.default]
|
||||
features = ["dev"]
|
||||
|
@ -209,7 +200,6 @@ ban-relative-imports = "all"
|
|||
[tool.ruff.lint.per-file-ignores]
|
||||
# Tests can use magic values, assertions, and relative imports
|
||||
"tests/**/*" = ["PLR2004", "S101", "TID252"]
|
||||
"rules/generate_rules.py" = ["T201"]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
minversion = "7.3"
|
||||
|
|
|
@ -1,174 +0,0 @@
|
|||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
from jinja2 import Environment
|
||||
|
||||
rules_src = Path(__file__).with_name("rules.yaml")
|
||||
if not rules_src.exists():
|
||||
# This skip is useful mostly for CI operations when installing only Python deps
|
||||
print("Skipping rules generation, rule file is missing")
|
||||
sys.exit()
|
||||
|
||||
FUZZY_RULES = yaml.safe_load(rules_src.read_text())["fuzzyRules"]
|
||||
|
||||
for rule in FUZZY_RULES:
|
||||
if "name" not in rule:
|
||||
raise SystemExit("Fuzzy rule is missing a name")
|
||||
if "tests" not in rule or len(rule["tests"]) == 0:
|
||||
raise SystemExit("Fuzzy rule is missing test cases")
|
||||
|
||||
|
||||
PY2JS_RULE_RX = re.compile(r"\\(\d)", re.ASCII)
|
||||
|
||||
# Do not escape anything, we want to generate code as-is, it won't be interpreted as
|
||||
# HTML anyway
|
||||
JINJA_ENV = Environment(autoescape=False) # noqa: S701
|
||||
|
||||
### Generate Javascript code
|
||||
|
||||
js_code_template = """// THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY
|
||||
|
||||
export const fuzzyRules = [
|
||||
{% for rule in FUZZY_RULES %} {
|
||||
match: '{{ rule['match'] }}',
|
||||
replace: '{{ rule['replace'] }}',
|
||||
},
|
||||
{% endfor %}
|
||||
];
|
||||
|
||||
"""
|
||||
|
||||
js_parent = Path(__file__).joinpath("../../javascript/src").resolve()
|
||||
if not js_parent.exists():
|
||||
# This skip is useful mostly for CI operations when working on the Python part
|
||||
print("Skipping JS rules generation, target folder is missing")
|
||||
else:
|
||||
(js_parent / "fuzzyRules.js").write_text(
|
||||
JINJA_ENV.from_string(js_code_template).render(
|
||||
FUZZY_RULES=[
|
||||
{
|
||||
"match": rule["pattern"].replace("\\", "\\\\"),
|
||||
"replace": PY2JS_RULE_RX.sub(r"$\1", rule["replace"]),
|
||||
}
|
||||
for rule in FUZZY_RULES
|
||||
]
|
||||
)
|
||||
)
|
||||
print("JS rules generation completed successfully")
|
||||
|
||||
### Generate Javascript tests
|
||||
|
||||
js_test_template = """// THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY
|
||||
|
||||
import test from 'ava';
|
||||
|
||||
import { applyFuzzyRules } from '../src/wombatSetup.js';
|
||||
|
||||
{% for rule in FUZZY_RULES %}
|
||||
{% for test in rule['tests'] %}
|
||||
test('fuzzyrules_{{rule['name']}}_{{loop.index}}', (t) => {
|
||||
t.is(
|
||||
applyFuzzyRules(
|
||||
'{{test['raw_url']}}',
|
||||
),
|
||||
'{{test['raw_url'] if test['unchanged'] else test['fuzzified_url']}}',
|
||||
);
|
||||
});
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
"""
|
||||
|
||||
js_parent = Path(__file__).joinpath("../../javascript/test").resolve()
|
||||
if not js_parent.exists():
|
||||
# This skip is useful mostly for CI operations when working on the Python part
|
||||
print("Skipping JS tests generation, target folder is missing")
|
||||
else:
|
||||
(js_parent / "fuzzyRules.js").write_text(
|
||||
JINJA_ENV.from_string(js_test_template).render(
|
||||
FUZZY_RULES=[
|
||||
{
|
||||
"name": rule["name"],
|
||||
"tests": rule["tests"],
|
||||
"match": rule["pattern"].replace("\\", "\\\\"),
|
||||
"replace": PY2JS_RULE_RX.sub(r"$\1", rule["replace"]),
|
||||
}
|
||||
for rule in FUZZY_RULES
|
||||
]
|
||||
)
|
||||
)
|
||||
print("JS tests generation completed successfully")
|
||||
|
||||
### Generate Python code
|
||||
|
||||
py_code_template = """# THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY
|
||||
|
||||
FUZZY_RULES = [
|
||||
{% for rule in FUZZY_RULES %} {
|
||||
"pattern": r"{{ rule['pattern'] }}",
|
||||
"replace": r"{{ rule['replace'] }}",
|
||||
},
|
||||
{% endfor %}
|
||||
]
|
||||
"""
|
||||
|
||||
py_parent = Path(__file__).joinpath("../../src/warc2zim").resolve()
|
||||
if not py_parent.exists():
|
||||
# This skip is useful mostly for CI operations when working on the JS part
|
||||
print("Skipping Python rules generation, target folder is missing")
|
||||
else:
|
||||
(py_parent / "rules.py").absolute().write_text(
|
||||
JINJA_ENV.from_string(py_code_template).render(FUZZY_RULES=FUZZY_RULES)
|
||||
)
|
||||
print("Python rules generation completed successfully")
|
||||
|
||||
### Generate Python tests
|
||||
|
||||
py_test_template = """# THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY
|
||||
|
||||
import pytest
|
||||
|
||||
from warc2zim.url_rewriting import apply_fuzzy_rules
|
||||
|
||||
from .utils import ContentForTests
|
||||
|
||||
{% for rule in FUZZY_RULES %}
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
{% for test in rule['tests'] %}
|
||||
{% if test['unchanged'] %}
|
||||
ContentForTests(
|
||||
"{{ test['raw_url'] }}",
|
||||
),
|
||||
{% else %}
|
||||
ContentForTests(
|
||||
"{{ test['raw_url'] }}",
|
||||
"{{ test['fuzzified_url'] }}",
|
||||
),
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
]
|
||||
)
|
||||
def {{ rule['name'] }}_case(request):
|
||||
yield request.param
|
||||
|
||||
|
||||
def test_fuzzyrules_{{ rule['name'] }}({{ rule['name'] }}_case):
|
||||
assert (
|
||||
apply_fuzzy_rules({{ rule['name'] }}_case.input_str)
|
||||
== {{ rule['name'] }}_case.expected_str
|
||||
)
|
||||
{% endfor %}
|
||||
|
||||
"""
|
||||
|
||||
py_parent = Path(__file__).joinpath("../../tests").resolve()
|
||||
if not py_parent.exists():
|
||||
# This skip is useful mostly for CI operations when working on the JS part
|
||||
print("Skipping Python tests generation, target folder is missing")
|
||||
else:
|
||||
(py_parent / "test_fuzzy_rules.py").absolute().write_text(
|
||||
JINJA_ENV.from_string(py_test_template).render(FUZZY_RULES=FUZZY_RULES)
|
||||
)
|
||||
print("Python tests generation completed successfully")
|
213
rules/rules.yaml
213
rules/rules.yaml
|
@ -1,213 +0,0 @@
|
|||
---
# This file comes from an adaptation of rules present in
# https://github.com/webrecorder/wabac.js/blame/main/src/fuzzymatcher.js
#
# Syncing rules is done manually, based on expert knowledge, especially because in
# warc2zim we are not really fuzzy matching (searching the best entry among existing
# ones) but just rewriting to the proper path.
#
# This file is in sync with content at commit 879018d5b96962df82340a9a57570bbc0fc67815
# from June 9, 2024
#
# This file should be updated at every release of warc2zim
#
# Some rules are voluntarily missing because they have not been tested in warc2zim
# yet: Twitter, Washington Post, WixStatic, Facebook
#
# Generic rules are also omitted on purpose, we don't need them
#
# Each rule has a `name`, a regex `pattern`, a `replace` string using `\N` group
# references, and `tests`; a test gives a `raw_url` and either the expected
# `fuzzified_url` or `unchanged: true`.
fuzzyRules:
  - name: googlevideo_com
    pattern: '.*googlevideo.com/(videoplayback(?=\?)).*[?&](id=[^&]+).*'
    replace: 'youtube.fuzzy.replayweb.page/\1?\2'
    tests:
      - raw_url: foobargooglevideo.com/videoplayback?id=1576&key=value
        fuzzified_url: youtube.fuzzy.replayweb.page/videoplayback?id=1576
      - raw_url: foobargooglevideo.com/videoplayback?some=thing&id=1576
        fuzzified_url: youtube.fuzzy.replayweb.page/videoplayback?id=1576
      - raw_url: foobargooglevideo.com/videoplayback?some=thing&id=1576&key=value
        fuzzified_url: youtube.fuzzy.replayweb.page/videoplayback?id=1576
      - raw_url: foobargooglevideo.com/videoplaybackandfoo?some=thing&id=1576&key=value
        unchanged: true  # videoplayback is not followed by `?`
      - raw_url: foobargoogle_video.com/videoplaybackandfoo?some=thing&id=1576&key=value
        unchanged: true  # No googlevideo.com in url

  - name: youtube_video_info
    pattern: '(?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?).*(video_id=[^&]+).*'
    replace: 'youtube.fuzzy.replayweb.page/\1\2'
    tests:
      - raw_url: www.youtube.com/get_video_info?video_id=123ah
        fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
      - raw_url: www.youtube.com/get_video_info?foo=bar&video_id=123ah
        fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
      - raw_url: www.youtube.com/get_video_info?video_id=123ah&foo=bar
        fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
      - raw_url: youtube.com/get_video_info?video_id=123ah
        fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
      - raw_url: youtube-nocookie.com/get_video_info?video_id=123ah
        fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
      - raw_url: www.youtube-nocookie.com/get_video_info?video_id=123ah
        fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
      - raw_url: www.youtube-nocookie.com/get_video_info?foo=bar
        unchanged: true  # no video_id parameter
      - raw_url: www.youtubeqnocookie.com/get_video_info?video_id=123ah
        unchanged: true  # improper hostname

  - name: youtube_thumbnails
    pattern: 'i\.ytimg\.com\/vi\/(.*?)\/.*?\.(\w*?)(?:\?.*|$)'
    replace: 'i.ytimg.com.fuzzy.replayweb.page/vi/\1/thumbnail.\2'
    tests:
      - raw_url: i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.jpg?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGHIgTyg-MA8=&rs=AOn4CLDr-FmDmP3aCsD84l48ygBmkwHg-g
        fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg
      - raw_url: i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.png?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGHIgTyg-MA8=&rs=AOn4CLDr-FmDmP3aCsD84l48ygBmkwHg-g
        fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.png
      - raw_url: i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.jpg
        fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg
      - raw_url: i.ytimg.com/vi/-KpLmsAR23I/max-res.default.jpg
        fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg

  - name: trim_digits_only
    pattern: '([^?]+)\?[\d]+$'
    replace: '\1'
    tests:
      - raw_url: www.example.com/page?1234
        fuzzified_url: www.example.com/page
      - raw_url: www.example.com/page?foo=1234
        unchanged: true
      - raw_url: www.example.com/page1234
        unchanged: true
      - raw_url: www.example.com/page?foo=bar&1234
        unchanged: true
      - raw_url: www.example.com/page?1234=bar
        unchanged: true
      - raw_url: www.example.com/page?1234&foo=bar
        unchanged: true

  - name: youtubei
    pattern: '(?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*'
    replace: 'youtube.fuzzy.replayweb.page/\1?\2'
    tests:
      - raw_url: www.youtube-nocookie.com/youtubei/page/?videoId=123ah
        fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
      - raw_url: youtube-nocookie.com/youtubei/page/?videoId=123ah
        fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
      - raw_url: youtube.com/youtubei/page/?videoId=123ah
        fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
      - raw_url: www.youtube.com/youtubei/page/?videoId=123ah
        fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
      - raw_url: youtube.com/youtubei/page/videoId=123ah
        fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
      - raw_url: youtube.com/youtubei/page/videoIdqqq=123ah
        fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoIdqqq=123ah
      - raw_url: youtube.com/youtubei/page/videoId=123ah&foo=bar
        fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
      - raw_url: youtube.com/youtubei/page/?foo=bar&videoId=123ah
        fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
      - raw_url: youtube.com/youtubei/page/foo=bar&videoId=123ah
        fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/foo=bar&?videoId=123ah
      - raw_url: youtube.com/youtubei/?videoId=123ah
        unchanged: true

  - name: youtube_embed
    pattern: '(?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*'
    replace: 'youtube.fuzzy.replayweb.page/embed/\1'
    tests:
      - raw_url: www.youtube-nocookie.com/embed/foo
        fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo
      - raw_url: www.youtube-nocookie.com/embed/bar
        fuzzified_url: youtube.fuzzy.replayweb.page/embed/bar
      - raw_url: www.youtube-nocookie.com/embed/foo/bar
        fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo/bar
      - raw_url: www.youtube.com/embed/foo
        fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo
      - raw_url: youtube.com/embed/foo
        fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo
      - raw_url: youtube-nocookie.com/embed/foo
        fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo
      - raw_url: youtube.com/embed/foo?bar=alice
        fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo

  - name: vimeo_cdn_fix  # custom warc2zim rule intended to fix Vimeo support
    pattern: '.*(?:gcs-vimeo|vod|vod-progressive|vod-adaptive)\.akamaized\.net.*\/(.+?.mp4)\?.*range=(.*?)(?:&.*|$)'
    replace: 'vimeo-cdn.fuzzy.replayweb.page/\1?range=\2'
    tests:
      - raw_url: gcs-vimeo.akamaized.net/123.mp4?range=123-456
        fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
      - raw_url: vod.akamaized.net/123.mp4?range=123-456
        fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
      - raw_url: vod-progressive.akamaized.net/123.mp4?range=123-456
        fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
      - raw_url: vod-adaptive.akamaized.net/123.mp4?range=123-456
        fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
      - raw_url: vod.akamaized.net/123.mp4?foo=bar&range=123-456
        fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
      - raw_url: vod.akamaized.net/123.mp4?foo=bar&range=123-456&bar=foo
        fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
      - raw_url: vod.akamaized.net/123.mp4?range=123-456&bar=foo
        fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
      - raw_url: foovod.akamaized.net/123.mp4?range=123-456
        fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
      - raw_url: vod.akamaized.net/1/23.mp4?range=123-456
        fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456
      - raw_url: vod.akamaized.net/a/23.mp4?range=123-456
        fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456
      - raw_url: vod.akamaized.net/foo/bar/23.mp4?range=123-456
        fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456
      - raw_url: foo.akamaized.net/123.mp4?range=123-456
        unchanged: true

  - name: vimeo_cdn
    pattern: '.*(?:gcs-vimeo|vod|vod-progressive)\.akamaized\.net.*?\/([\d/]+.mp4)$'
    replace: 'vimeo-cdn.fuzzy.replayweb.page/\1'
    tests:
      - raw_url: vod.akamaized.net/23.mp4
        fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4
      - raw_url: vod.akamaized.net/23/12332.mp4
        fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23/12332.mp4
      - raw_url: https://vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4
        fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4

  - name: vimeo_player
    pattern: '.*player.vimeo.com\/(video\/[\d]+)\?.*'
    replace: 'vimeo.fuzzy.replayweb.page/\1'
    tests:
      - raw_url: player.vimeo.com/video/1234?foo=bar
        fuzzified_url: vimeo.fuzzy.replayweb.page/video/1234
      - raw_url: foo.player.vimeo.com/video/1234?foo=bar
        fuzzified_url: vimeo.fuzzy.replayweb.page/video/1234
      - raw_url: player.vimeo.com/video/1234?foo
        fuzzified_url: vimeo.fuzzy.replayweb.page/video/1234
      - raw_url: player.vimeo.com/video/1/23?foo=bar
        unchanged: true
      - raw_url: player.vimeo.com/video/123a?foo=bar
        unchanged: true
      - raw_url: player.vimeo.com/video/?foo=bar
        unchanged: true

  - name: i_vimeo_cdn
    pattern: '.*i\.vimeocdn\.com\/(.*)\?.*'
    replace: 'i.vimeocdn.fuzzy.replayweb.page/\1'
    tests:
      - raw_url: i.vimeocdn.com/image/1234?foo=bar
        fuzzified_url: i.vimeocdn.fuzzy.replayweb.page/image/1234
      - raw_url: i.vimeocdn.com/something/a456?foo
        fuzzified_url: i.vimeocdn.fuzzy.replayweb.page/something/a456

  - name: cheatography_com
    pattern: 'cheatography\.com\/scripts\/(.*).js.*[?&](v=[^&]+).*'
    replace: 'cheatography.com.fuzzy.replayweb.page/scripts/\1.js?\2'
    tests:
      - raw_url: cheatography.com/scripts/useful.min.js?v=2&q=1719438924
        fuzzified_url: cheatography.com.fuzzy.replayweb.page/scripts/useful.min.js?v=2
      - raw_url: cheatography.com/scripts/foo.js?v=2&q=1719438924
        fuzzified_url: cheatography.com.fuzzy.replayweb.page/scripts/foo.js?v=2
      - raw_url: cheatography.com/scripts/useful.min.js?q=1719438924&v=2
        fuzzified_url: cheatography.com.fuzzy.replayweb.page/scripts/useful.min.js?v=2
      - raw_url: cheatography.com/scripts/useful.min.js?q=1719438924&v=2&foo=bar
        fuzzified_url: cheatography.com.fuzzy.replayweb.page/scripts/useful.min.js?v=2

  - name: der_postillon_com
    pattern: 'blogger.googleusercontent.com\/img\/(.*\.jpg)=.*'
    replace: 'blogger.googleusercontent.com.fuzzy.replayweb.page/img/\1.resized'
    tests:
      - raw_url: blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjlN4LY6kFVwL8-rinDWp3kJp1TowOVD8vq8TP8nl3Lf1sI-hx0DE1GQA1jw7DT7XvK3FjghzJ17_1pvyXyDBAV0vtigJRnFCNfMxnndBnN3NYoXUvKQQsQ7JTGXOSajdo0mNQIv8wss_AxPBMrR4-Dd_EEacV7ZMS3m_IL2dz0WsbbKn7FD7ntsfOe0JUq/s600-rw/tickerzugtier2.jpg=w487-h220-p-k-no-nu
        fuzzified_url: blogger.googleusercontent.com.fuzzy.replayweb.page/img/b/R29vZ2xl/AVvXsEjlN4LY6kFVwL8-rinDWp3kJp1TowOVD8vq8TP8nl3Lf1sI-hx0DE1GQA1jw7DT7XvK3FjghzJ17_1pvyXyDBAV0vtigJRnFCNfMxnndBnN3NYoXUvKQQsQ7JTGXOSajdo0mNQIv8wss_AxPBMrR4-Dd_EEacV7ZMS3m_IL2dz0WsbbKn7FD7ntsfOe0JUq/s600-rw/tickerzugtier2.jpg.resized
      - raw_url: blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjlN4LY6kFVwL8-rinDWp3kJp1TowOVD8vq8TP8nl3Lf1sI-hx0DE1GQA1jw7DT7XvK3FjghzJ17_1pvyXyDBAV0vtigJRnFCNfMxnndBnN3NYoXUvKQQsQ7JTGXOSajdo0mNQIv8wss_AxPBMrR4-Dd_EEacV7ZMS3m_IL2dz0WsbbKn7FD7ntsfOe0JUq/w72-h72-p-k-no-nu/tickerzugtier2.jpg
        unchanged: true

  - name: iranwire_com
    pattern: '(iranwire\.com\/questions\/detail\/.*)\?.*'
    replace: '\1'
    tests:
      - raw_url: iranwire.com/questions/detail/1723?&_=1721804954220
        fuzzified_url: iranwire.com/questions/detail/1723
      - raw_url: iranwire.com/questions/detail/1725?foo=bar&_=1721804454220
        fuzzified_url: iranwire.com/questions/detail/1725
|
|
@ -1,115 +0,0 @@
|
|||
import re
|
||||
from collections.abc import Iterable
|
||||
|
||||
from tinycss2 import (
|
||||
ast,
|
||||
parse_declaration_list,
|
||||
parse_stylesheet,
|
||||
parse_stylesheet_bytes,
|
||||
serialize,
|
||||
)
|
||||
from tinycss2.serializer import serialize_url
|
||||
|
||||
from warc2zim.constants import logger
|
||||
from warc2zim.content_rewriting.rx_replacer import RxRewriter
|
||||
from warc2zim.url_rewriting import ArticleUrlRewriter
|
||||
|
||||
|
||||
class FallbackRegexCssRewriter(RxRewriter):
    """Regex-based CSS rewriter used when the CSS could not be processed normally

    It only rewrites the URL found in `url(...)` values, whether the URL is
    single-quoted, double-quoted or not quoted at all.
    """

    def __init__(self, url_rewriter: ArticleUrlRewriter, base_href: str | None):
        def rewrite_url(m_object, _opts) -> str:
            # `quote` is the empty string for unquoted URLs, so the original
            # quoting style is preserved in every case
            quote = m_object["quote"]
            return "".join(
                ["url(", quote, url_rewriter(m_object["url"], base_href), quote, ")"]
            )

        rules = [
            (
                # The `?` is inside the `quote` group so that the group always
                # participates in the match (possibly as an empty string). With
                # an optional group, Python fails the `(?P=quote)` backreference
                # whenever no quote is present and unquoted URLs never match.
                re.compile(r"""url\((?P<quote>['"]?)(?P<url>.+?)(?P=quote)(?<!\\)\)"""),
                rewrite_url,
            )
        ]
        super().__init__(rules)
|
||||
|
||||
|
||||
class CssRewriter:
    """Rewrite the URLs found in CSS content (stylesheets and inline declarations)

    The CSS is parsed with tinycss2 and every URL found is passed to
    `url_rewriter`. Whenever anything goes wrong during this process, the content
    is handed to the regex based `FallbackRegexCssRewriter` instead.
    """

    def __init__(self, url_rewriter: ArticleUrlRewriter, base_href: str | None):
        self.url_rewriter = url_rewriter
        self.base_href = base_href
        self.fallback_rewriter = FallbackRegexCssRewriter(url_rewriter, base_href)

    def rewrite(self, content: str | bytes) -> str:
        """Rewrite a whole stylesheet and return the rewritten CSS"""
        try:
            if isinstance(content, bytes):
                rules = parse_stylesheet_bytes(content)[0]
            else:
                rules = parse_stylesheet(content)
            self.process_list(rules)

            output = serialize(rules)
        except Exception:
            # If tinycss fail to parse css, it will generate a "Error" token.
            # Exception is raised at serialization time.
            # We try/catch the whole process to be sure anyway.
            logger.warning(
                (
                    "Css transformation fails. Fallback to regex rewriter.\n"
                    "Article path is %s"
                ),
                self.url_rewriter.article_url,
            )
            return self.fallback_rewriter.rewrite(content, {})
        return output

    def rewrite_inline(self, content: str) -> str:
        """Rewrite a list of declarations (e.g. the value of a `style` attribute)"""
        try:
            rules = parse_declaration_list(content)
            self.process_list(rules)
            output = serialize(rules)
            return output
        except Exception:
            # If tinycss fail to parse css, it will generate a "Error" token.
            # Exception is raised at serialization time.
            # We try/catch the whole process to be sure anyway.
            logger.warning(
                (
                    "Css transformation fails. Fallback to regex rewriter.\n"
                    "Content is `%s`"
                ),
                content,
            )
            return self.fallback_rewriter.rewrite(content, {})

    def process_list(self, components: Iterable[ast.Node]):
        """Process every node of `components`, which may be empty or None"""
        if components:  # May be null
            for component in components:
                self.process(component)

    def process(self, component: ast.Node):
        """Rewrite in place the URLs held by `component` and its descendants"""
        if isinstance(
            component,
            ast.QualifiedRule
            | ast.SquareBracketsBlock
            | ast.ParenthesesBlock
            | ast.CurlyBracketsBlock,
        ):
            self.process_list(component.content)
        elif isinstance(component, ast.FunctionBlock):
            if component.lower_name == "url":
                # `url("...")` with a quoted value is a function block. Its
                # arguments keep surrounding white space as separate tokens, so
                # the string is looked up instead of blindly using the first
                # argument (which would rewrite a whitespace token).
                for argument in component.arguments:
                    if isinstance(argument, ast.StringToken):
                        new_url = self.url_rewriter(argument.value, self.base_href)
                        argument.value = new_url
                        argument.representation = f'"{serialize_url(new_url)}"'
                        break
            else:
                self.process_list(component.arguments)
        elif isinstance(component, ast.AtRule):
            self.process_list(component.prelude)
            self.process_list(component.content)
        elif isinstance(component, ast.Declaration):
            self.process_list(component.value)
        elif isinstance(component, ast.URLToken):
            # unquoted `url(...)` value
            new_url = self.url_rewriter(component.value, self.base_href)
            component.value = new_url
            component.representation = f"url({serialize_url(new_url)})"
|
|
@ -1,665 +0,0 @@
|
|||
import io
|
||||
import re
|
||||
from collections import namedtuple
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
from functools import cache
|
||||
from html import escape
|
||||
from html.parser import HTMLParser
|
||||
from inspect import Signature, signature
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from warc2zim.content_rewriting.css import CssRewriter
|
||||
from warc2zim.content_rewriting.js import JsRewriter
|
||||
from warc2zim.url_rewriting import ArticleUrlRewriter, ZimPath
|
||||
|
||||
AttrNameAndValue = tuple[str, str | None]
|
||||
AttrsList = list[AttrNameAndValue]
|
||||
|
||||
RewritenHtml = namedtuple("RewritenHmtl", ["title", "content"])
|
||||
|
||||
# Splits the `content` value of a meta refresh (`<interval>; url=<target>`) into
# its `interval` and `url` parts. The `url` keyword is matched case-insensitively
# since `URL=` is commonly found in documents.
HTTP_EQUIV_REDIRECT_RE = re.compile(
    r"^\s*(?P<interval>.*?)\s*;\s*url\s*=\s*(?P<url>.*?)\s*$",
    re.IGNORECASE,
)
|
||||
|
||||
|
||||
def get_attr_value_from(
    attrs: AttrsList, name: str, default: str | None = None
) -> str | None:
    """Return the value of the first attribute named `name`, else `default`

    The value returned may be None when the attribute is present without value.
    """
    return next(
        (value for attr_name, value in attrs if attr_name == name),
        default,
    )
|
||||
|
||||
|
||||
def format_attr(name: str, value: str | None) -> str:
|
||||
"""Format a given attribute name and value, properly escaping the value"""
|
||||
if value is None:
|
||||
return name
|
||||
html_escaped_value = escape(value, quote=True)
|
||||
return f'{name}="{html_escaped_value}"'
|
||||
|
||||
|
||||
def get_html_rewrite_context(tag: str, attrs: AttrsList) -> str:
    """Get current HTML rewrite context

    By default, rewrite context is the HTML tag. But in some cases (e.g. script tags) we
    need to be more precise since rewriting logic will vary based on another attribute
    value (e.g. type attribute for script tags)

    Attribute values are compared case-insensitively, ignoring surrounding white
    space, and `rel` is handled as a list of space-separated tokens.
    """
    if tag == "script":
        script_type = (get_attr_value_from(attrs, "type") or "").strip().lower()
        return {
            "application/json": "json",
            "json": "json",
            "module": "js-module",
            "application/javascript": "js-classic",
            "text/javascript": "js-classic",
            "": "js-classic",
        }.get(script_type, "unknown")
    elif tag == "link":
        rel_tokens = (get_attr_value_from(attrs, "rel") or "").lower().split()
        if "modulepreload" in rel_tokens:
            return "js-module"
        elif "preload" in rel_tokens:
            preload_type = (get_attr_value_from(attrs, "as") or "").strip().lower()
            if preload_type == "script":
                return "js-classic"
    return tag
|
||||
|
||||
|
||||
def extract_base_href(content: str) -> str | None:
    """Return the href of the first <base> tag of the HTML head having one

    This runs on the whole document before the real parsing / rewriting, because
    the base href is needed to rewrite links which might be located before the
    <base> tag in the head (e.g. <link> for favicons). None is returned when the
    document has no head or no <base> tag with an href.
    """
    head = BeautifulSoup(content, features="lxml").head
    if not head:
        return None
    return next(
        (base["href"] for base in head.find_all("base") if base.has_attr("href")),
        None,
    )
|
||||
|
||||
|
||||
@cache
|
||||
def _cached_signature(func: Callable) -> Signature:
|
||||
"""Returns the signature of a given callable
|
||||
|
||||
Result is cached to save performance when reused multiple times
|
||||
"""
|
||||
return signature(func)
|
||||
|
||||
|
||||
class HtmlRewriter(HTMLParser):
    """Rewrite an HTML document while it is parsed by `HTMLParser`

    Every event of the parser is serialized back to the output buffer, after the
    rewriting rules registered in `rules` have been applied to tags, attributes
    and data. The text of the first <title> encountered is captured as well.
    """

    def __init__(
        self,
        url_rewriter: ArticleUrlRewriter,
        pre_head_insert: str,
        post_head_insert: str | None,
        notify_js_module: Callable[[ZimPath], None],
    ):
        # char/entity references are kept as-is and re-emitted by
        # handle_entityref / handle_charref
        super().__init__(convert_charrefs=False)
        self.url_rewriter = url_rewriter
        self.title = None
        self.output = None
        # This works only for tag without children.
        # But as we use it to get the title, we are ok
        self.html_rewrite_context = None
        self.pre_head_insert = pre_head_insert
        self.post_head_insert = post_head_insert
        self.notify_js_module = notify_js_module

    def rewrite(self, content: str) -> RewritenHtml:
        """Rewrite `content` and return the title found and the rewritten HTML"""
        if self.output is not None:
            raise Exception("ouput should not already be set")  # pragma: no cover
        self.output = io.StringIO()

        # base href must be known before any link is rewritten
        self.base_href = extract_base_href(content)
        self.css_rewriter = CssRewriter(self.url_rewriter, self.base_href)
        self.js_rewriter = JsRewriter(
            url_rewriter=self.url_rewriter,
            base_href=self.base_href,
            notify_js_module=self.notify_js_module,
        )

        self.feed(content)
        self.close()

        output = self.output.getvalue()
        self.output = None
        return RewritenHtml(self.title or "", output)

    def send(self, value: str):
        """Append `value` to the output buffer"""
        self.output.write(value)  # pyright: ignore[reportOptionalMemberAccess]

    def handle_starttag(self, tag: str, attrs: AttrsList, *, auto_close: bool = False):
        """Emit a start tag (or a self-closing one when `auto_close` is set)"""
        self.html_rewrite_context = get_html_rewrite_context(tag=tag, attrs=attrs)

        # a tag rule, when one applies, is responsible for the whole tag
        if (
            rewritten := rules._do_tag_rewrite(
                tag=tag, attrs=attrs, auto_close=auto_close
            )
        ) is not None:
            self.send(rewritten)
            return

        self.send(f"<{tag}")
        if attrs:
            self.send(" ")
        self.send(
            " ".join(
                format_attr(*attr)
                for attr in (
                    rules._do_attribute_rewrite(
                        tag=tag,
                        attr_name=attr_name,
                        attr_value=attr_value,
                        attrs=attrs,
                        js_rewriter=self.js_rewriter,
                        css_rewriter=self.css_rewriter,
                        url_rewriter=self.url_rewriter,
                        base_href=self.base_href,
                        notify_js_module=self.notify_js_module,
                    )
                    for attr_name, attr_value in attrs
                    if not rules._do_drop_attribute(
                        tag=tag, attr_name=attr_name, attr_value=attr_value, attrs=attrs
                    )
                )
            )
        )

        if auto_close:
            self.send(" />")
        else:
            self.send(">")
        if tag == "head" and self.pre_head_insert:
            self.send(self.pre_head_insert)

    def handle_endtag(self, tag: str):
        """Emit an end tag, preceded by `post_head_insert` when closing the head"""
        self.html_rewrite_context = None
        if tag == "head" and self.post_head_insert:
            self.send(self.post_head_insert)
        self.send(f"</{tag}>")

    def handle_startendtag(self, tag: str, attrs: AttrsList):
        """Emit a self-closing tag; there is no data context after it"""
        self.handle_starttag(tag, attrs, auto_close=True)
        self.html_rewrite_context = None

    def handle_data(self, data: str):
        """Emit text data, rewritten when a data rule applies to current context"""
        if self.html_rewrite_context == "title" and self.title is None:
            self.title = data.strip()
        if (
            data.strip()
            and (
                rewritten := rules._do_data_rewrite(
                    html_rewrite_context=self.html_rewrite_context,
                    data=data,
                    css_rewriter=self.css_rewriter,
                    js_rewriter=self.js_rewriter,
                    url_rewriter=self.url_rewriter,
                )
            )
            is not None
        ):
            self.send(rewritten)
            return
        self.send(data)

    def handle_entityref(self, name: str):
        self.send(f"&{name};")

    def handle_charref(self, name: str):
        self.send(f"&#{name};")

    def handle_comment(self, data: str):
        self.send(f"<!--{data}-->")

    def handle_decl(self, decl: str):
        self.send(f"<!{decl}>")

    def handle_pi(self, data: str):
        self.send(f"<?{data}>")

    def unknown_decl(self, data: str):
        """Re-emit a marked section (`<![CDATA[...]]>`, `<![if ...]>`, ...)

        The parser passes the text found between the opening `<![` and the closing
        marker, so both have to be restored; serializing it like a regular
        declaration would produce `<!CDATA[...>`.
        """
        keyword = re.match(r"\s*([a-zA-Z]+)", data)
        # `if` / `else` / `endif` sections are closed by `]>`, others by `]]>`
        if keyword is not None and keyword[1].lower() in ("if", "else", "endif"):
            closing = "]>"
        else:
            closing = "]]>"
        self.send(f"<![{data}{closing}")
|
||||
|
||||
|
||||
DropAttributeCallable = Callable[..., bool]
|
||||
RewriteAttributeCallable = Callable[..., AttrNameAndValue | None]
|
||||
RewriteTagCallable = Callable[..., str | None]
|
||||
RewriteDataCallable = Callable[..., str | None]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DropAttributeRule:
    """Immutable (hence hashable) holder of an attribute dropping rule"""

    # callable deciding whether an attribute has to be dropped
    func: DropAttributeCallable
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RewriteAttributeRule:
    """Immutable (hence hashable) holder of an attribute rewriting rule"""

    # callable returning the new attribute name and value, or None
    func: RewriteAttributeCallable
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RewriteTagRule:
    """Immutable (hence hashable) holder of a tag rewriting rule"""

    # callable returning the serialized replacement of a tag, or None
    func: RewriteTagCallable
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RewriteDataRule:
    """Immutable (hence hashable) holder of a data rewriting rule"""

    # callable returning the rewritten data, or None
    func: RewriteDataCallable
|
||||
|
||||
|
||||
def _check_decorated_func_signature(expected_func: Callable, decorated_func: Callable):
    """Validate the parameters of a function decorated as a rewriting rule

    Every parameter of `decorated_func` must exist in `expected_func` under the
    same name and with the very same annotation. A TypeError is raised otherwise.
    The decorated function may declare only a subset of the expected parameters.
    """
    supported = _cached_signature(expected_func).parameters
    declared = _cached_signature(decorated_func).parameters
    for name, param in declared.items():
        reference = supported.get(name)
        if reference is None:
            raise TypeError(
                f"Parameter '{name}' is unsupported in function "
                f"'{decorated_func.__name__}'"
            )
        if reference.annotation != param.annotation:
            raise TypeError(
                f"Parameter '{name}' in function '{decorated_func.__name__}' must be of"
                f" type '{reference.annotation}'"
            )
|
||||
|
||||
|
||||
class HTMLRewritingRules:
    """Registry of the rules used to rewrite HTML documents

    Rules are plain functions registered through the decorators exposed by this
    class. A rule only declares the parameters it needs; when the rule is run, it
    is called with the matching subset of the available arguments.
    """

    def __init__(self) -> None:
        self.drop_attribute_rules: set[DropAttributeRule] = set()
        self.rewrite_attribute_rules: set[RewriteAttributeRule] = set()
        self.rewrite_tag_rules: set[RewriteTagRule] = set()
        self.rewrite_data_rules: set[RewriteDataRule] = set()

    @staticmethod
    def _call_rule(func: Callable, available: dict):
        """Call `func` with the entries of `available` it declares as parameters"""
        accepted = _cached_signature(func).parameters
        return func(
            **{
                arg_name: arg_value
                for arg_name, arg_value in available.items()
                if arg_name in accepted
            }
        )

    def drop_attribute(
        self,
    ) -> Callable[[DropAttributeCallable], DropAttributeCallable]:
        """Decorator registering a rule deciding if an attribute is dropped"""

        def decorator(func: DropAttributeCallable) -> DropAttributeCallable:
            _check_decorated_func_signature(self._do_drop_attribute, func)
            self.drop_attribute_rules.add(DropAttributeRule(func=func))
            return func

        return decorator

    def rewrite_attribute(
        self,
    ) -> Callable[[RewriteAttributeCallable], RewriteAttributeCallable]:
        """Decorator registering a rule rewriting an attribute"""

        def decorator(func: RewriteAttributeCallable) -> RewriteAttributeCallable:
            _check_decorated_func_signature(self._do_attribute_rewrite, func)
            self.rewrite_attribute_rules.add(RewriteAttributeRule(func=func))
            return func

        return decorator

    def rewrite_tag(
        self,
    ) -> Callable[[RewriteTagCallable], RewriteTagCallable]:
        """Decorator registering a rule rewriting a whole start tag

        Such a rule is also given the opportunity to rewrite startend tags
        (autoclosing tags).
        """

        def decorator(func: RewriteTagCallable) -> RewriteTagCallable:
            _check_decorated_func_signature(self._do_tag_rewrite, func)
            self.rewrite_tag_rules.add(RewriteTagRule(func=func))
            return func

        return decorator

    def rewrite_data(
        self,
    ) -> Callable[[RewriteDataCallable], RewriteDataCallable]:
        """Decorator registering a rule rewriting the data of a tag"""

        def decorator(func: RewriteDataCallable) -> RewriteDataCallable:
            _check_decorated_func_signature(self._do_data_rewrite, func)
            self.rewrite_data_rules.add(RewriteDataRule(func=func))
            return func

        return decorator

    def _do_drop_attribute(
        self, tag: str, attr_name: str, attr_value: str | None, attrs: AttrsList
    ) -> bool:
        """Run the attribute dropping rules

        Returns True as soon as one rule returns True.
        """
        available = {
            "tag": tag,
            "attr_name": attr_name,
            "attr_value": attr_value,
            "attrs": attrs,
        }
        for rule in self.drop_attribute_rules:
            if self._call_rule(rule.func, available) is True:
                return True
        return False

    def _do_attribute_rewrite(
        self,
        tag: str,
        attr_name: str,
        attr_value: str | None,
        attrs: AttrsList,
        js_rewriter: JsRewriter,
        css_rewriter: CssRewriter,
        url_rewriter: ArticleUrlRewriter,
        base_href: str | None,
        notify_js_module: Callable[[ZimPath], None],
    ) -> AttrNameAndValue:
        """Run the attribute rewriting rules

        Rules are chained: the name and value returned by a rule are the ones
        passed to the rules run after it. Attributes without value are returned
        untouched. Returns the final attribute name and value.
        """
        if attr_value is None:
            return attr_name, None

        for rule in self.rewrite_attribute_rules:
            rewritten = self._call_rule(
                rule.func,
                {
                    "tag": tag,
                    "attr_name": attr_name,
                    "attr_value": attr_value,
                    "attrs": attrs,
                    "js_rewriter": js_rewriter,
                    "css_rewriter": css_rewriter,
                    "url_rewriter": url_rewriter,
                    "base_href": base_href,
                    "notify_js_module": notify_js_module,
                },
            )
            if rewritten is not None:
                attr_name, attr_value = rewritten

        return attr_name, attr_value

    def _do_tag_rewrite(
        self,
        tag: str,
        attrs: AttrsList,
        *,
        auto_close: bool,
    ) -> str | None:
        """Run the tag rewriting rules

        Returns the result of the first rule not returning None, or None when no
        rule rewrites the tag.
        """
        available = {"tag": tag, "attrs": attrs, "auto_close": auto_close}
        for rule in self.rewrite_tag_rules:
            rewritten = self._call_rule(rule.func, available)
            if rewritten is not None:
                return rewritten
        return None

    def _do_data_rewrite(
        self,
        html_rewrite_context: str | None,
        data: str,
        css_rewriter: CssRewriter,
        js_rewriter: JsRewriter,
        url_rewriter: ArticleUrlRewriter,
    ) -> str | None:
        """Run the data rewriting rules

        Returns the result of the first rule not returning None, or None when no
        rule rewrites the data.
        """
        available = {
            "html_rewrite_context": html_rewrite_context,
            "data": data,
            "css_rewriter": css_rewriter,
            "js_rewriter": js_rewriter,
            "url_rewriter": url_rewriter,
        }
        for rule in self.rewrite_data_rules:
            rewritten = self._call_rule(rule.func, available)
            if rewritten is not None:
                return rewritten
        return None
|
||||
|
||||
|
||||
rules = HTMLRewritingRules()
|
||||
|
||||
|
||||
@rules.drop_attribute()
def drop_script_integrity_attribute(tag: str, attr_name: str):
    """Tell that the integrity attribute of <script> tags has to be dropped"""
    return (tag, attr_name) == ("script", "integrity")
|
||||
|
||||
|
||||
@rules.drop_attribute()
def drop_link_integrity_attribute(tag: str, attr_name: str):
    """Tell that the integrity attribute of <link> tags has to be dropped"""
    return (tag, attr_name) == ("link", "integrity")
|
||||
|
||||
|
||||
@rules.rewrite_attribute()
def rewrite_meta_charset_content(
    tag: str, attr_name: str, attrs: AttrsList
) -> AttrNameAndValue | None:
    """Force the charset declared in <meta> tags to UTF-8

    Both forms are handled: <meta charset='xxx'> and
    <meta http-equiv='content-type' content='text/html; charset=xxx'>
    """
    if tag != "meta":
        return None
    if attr_name == "charset":
        return (attr_name, "UTF-8")
    if attr_name != "content":
        return None
    # content is rewritten only when the tag also declares
    # http-equiv="content-type" (case-insensitive)
    for other_name, other_value in attrs:
        if (
            other_name.lower() == "http-equiv"
            and other_value
            and other_value.lower() == "content-type"
        ):
            return (attr_name, "text/html; charset=UTF-8")
    return None
|
||||
|
||||
|
||||
@rules.rewrite_attribute()
def rewrite_onxxx_tags(
    attr_name: str, attr_value: str | None, js_rewriter: JsRewriter
) -> AttrNameAndValue | None:
    """Rewrite the JS held by onxxx attributes (names starting with `on-` excluded)"""
    if not attr_value:
        return None
    if not attr_name.startswith("on") or attr_name.startswith("on-"):
        return None
    return (attr_name, js_rewriter.rewrite(attr_value))
|
||||
|
||||
|
||||
@rules.rewrite_attribute()
def rewrite_style_tags(
    attr_name: str, attr_value: str | None, css_rewriter: CssRewriter
) -> AttrNameAndValue | None:
    """Rewrite the inline CSS held by non-empty style attributes"""
    if attr_name != "style" or not attr_value:
        return None
    return (attr_name, css_rewriter.rewrite_inline(attr_value))
|
||||
|
||||
|
||||
@rules.rewrite_attribute()
def rewrite_href_src_attributes(
    tag: str,
    attr_name: str,
    attr_value: str | None,
    attrs: AttrsList,
    url_rewriter: ArticleUrlRewriter,
    base_href: str | None,
    notify_js_module: Callable[[ZimPath], None],
):
    """Rewrite the URL of non-empty href and src attributes

    When the tag is in a JS module context, the path of the item targeted is
    passed to `notify_js_module` before the URL is rewritten.
    """
    if not attr_value or attr_name not in ("href", "src"):
        return None
    in_module_context = get_html_rewrite_context(tag=tag, attrs=attrs) == "js-module"
    if in_module_context:
        module_path = url_rewriter.get_item_path(attr_value, base_href=base_href)
        notify_js_module(module_path)
    # `rewrite_all_url` is set for every tag but <a>
    new_value = url_rewriter(
        attr_value, base_href=base_href, rewrite_all_url=tag != "a"
    )
    return (attr_name, new_value)
|
||||
|
||||
|
||||
@rules.rewrite_attribute()
def rewrite_srcset_attribute(
    attr_name: str,
    attr_value: str | None,
    url_rewriter: ArticleUrlRewriter,
    base_href: str | None,
):
    """Rewrite the URL of every image candidate of srcset attributes

    Candidates are separated by commas; each one is a URL optionally followed by
    a descriptor (e.g. `2x` or `480w`).
    """
    if attr_name != "srcset" or not attr_value:
        return
    new_value_list = []
    for value in attr_value.split(","):
        # URL and descriptor may be separated by any white space (srcset values
        # are frequently wrapped over multiple lines), not only a plain space
        parts = value.strip().split(maxsplit=1)
        if not parts:
            # empty candidate (e.g. trailing comma): there is no URL to rewrite
            continue
        url, *other = parts
        new_url = url_rewriter(url, base_href=base_href)
        new_value_list.append(" ".join([new_url, *other]))
    return (attr_name, ", ".join(new_value_list))
|
||||
|
||||
|
||||
@rules.rewrite_tag()
def rewrite_base_tag(tag: str, attrs: AttrsList, *, auto_close: bool):
    """Strip the href attribute from <base> tags

    A dedicated tag rule is needed since the tag is dropped altogether (empty
    string returned) when href was its only attribute. <base> tags without href
    are not handled here (None returned), so that other rules are processed.
    """
    if tag != "base" or get_attr_value_from(attrs, "href") is None:
        return None
    values = " ".join(
        format_attr(attr_name, attr_value)
        for attr_name, attr_value in attrs
        if attr_name != "href"
    )
    if not values:
        return ""  # drop whole tag
    closing = "/>" if auto_close else ">"
    return f"<base {values}{closing}"
|
||||
|
||||
|
||||
@rules.rewrite_data()
def rewrite_css_data(
    html_rewrite_context: str | None, data: str, css_rewriter: CssRewriter
) -> str | None:
    """Rewrite the CSS found in data of the `style` context"""
    if html_rewrite_context == "style":
        return css_rewriter.rewrite(data)
    return None
|
||||
|
||||
|
||||
@rules.rewrite_data()
def rewrite_json_data(
    html_rewrite_context: str | None,
) -> str | None:
    """Placeholder for the rewriting of inline JSON

    None is always returned: we do not have any JSON rewriting left ATM since all
    these rules are applied in Browsertrix crawler before storing the WARC record
    for now.
    """
    if html_rewrite_context == "json":
        # nothing to rewrite for now, see above
        return None
    return None
|
||||
|
||||
|
||||
@rules.rewrite_data()
def rewrite_js_data(
    html_rewrite_context: str | None,
    data: str,
    js_rewriter: JsRewriter,
) -> str | None:
    """Rewrite the JS found in data of `js-*` contexts

    The rewriter is told whether the code is a module (`js-module` context).
    """
    if not html_rewrite_context or not html_rewrite_context.startswith("js-"):
        return None
    is_module = html_rewrite_context == "js-module"
    return js_rewriter.rewrite(data, opts={"isModule": is_module})
|
||||
|
||||
|
||||
@rules.rewrite_attribute()
def rewrite_meta_http_equiv_redirect(
    tag: str,
    attr_name: str,
    attr_value: str | None,
    attrs: AttrsList,
    url_rewriter: ArticleUrlRewriter,
    base_href: str | None,
) -> AttrNameAndValue | None:
    """Rewrite redirect URL in meta http-equiv refresh

    Only the `content` attribute of <meta> tags whose `http-equiv` is `refresh`
    (compared case-insensitively, like it is done for `content-type`) is
    rewritten, and only when its value holds a redirect URL.
    """
    if tag != "meta":
        return
    if attr_name != "content":
        return
    if not attr_value:
        return
    http_equiv = get_attr_value_from(attrs, "http-equiv")
    # documents commonly use `Refresh` or `REFRESH`
    if http_equiv is None or http_equiv.lower() != "refresh":
        return
    if (match := HTTP_EQUIV_REDIRECT_RE.match(attr_value)) is None:
        return
    return (
        attr_name,
        f"{match['interval']};url={url_rewriter(match['url'], base_href=base_href)}",
    )
|
|
@ -1,293 +0,0 @@
|
|||
import re
|
||||
from collections.abc import Callable, Iterable
|
||||
from typing import Any
|
||||
|
||||
from warc2zim.content_rewriting.rx_replacer import (
|
||||
RxRewriter,
|
||||
TransformationAction,
|
||||
TransformationRule,
|
||||
add_prefix,
|
||||
m2str,
|
||||
replace,
|
||||
replace_prefix_from,
|
||||
)
|
||||
from warc2zim.url_rewriting import ArticleUrlRewriter, ZimPath
|
||||
|
||||
# The regex used to rewrite `import ...` in module code.
|
||||
IMPORT_MATCH_RX = re.compile(
|
||||
r"""^\s*?import(?:['"\s]*(?:[\w*${}\s,]+from\s*)?['"\s]?['"\s])(?:.*?)['"\s]""",
|
||||
)
|
||||
|
||||
# A sub regex used inside `import ...` rewrite to rewrite http url imported
|
||||
IMPORT_HTTP_RX = re.compile(
|
||||
r"""(import(?:['"\s]*(?:[\w*${}\s,]+from\s*)?['"\s]?['"\s]))((?:https?|[./]).*?)(['"\s])""",
|
||||
)
|
||||
|
||||
# This list of global variables we want to wrap.
|
||||
# We will setup the wrap only if the js script use them.
|
||||
GLOBAL_OVERRIDES = [
|
||||
"window",
|
||||
"globalThis",
|
||||
"self",
|
||||
"document",
|
||||
"location",
|
||||
"top",
|
||||
"parent",
|
||||
"frames",
|
||||
"opener",
|
||||
]
|
||||
|
||||
GLOBALS_RX = re.compile(
|
||||
r"("
|
||||
+ "|".join([r"(?:^|[^$.])\b" + x + r"\b(?:$|[^$])" for x in GLOBAL_OVERRIDES])
|
||||
+ ")"
|
||||
)
|
||||
|
||||
# This will replace `this` in code. The `_____WB$wombat$check$this$function_____`
|
||||
# will "see" with wombat and may return a "wrapper" around `this`
|
||||
this_rw = "_____WB$wombat$check$this$function_____(this)"
|
||||
|
||||
|
||||
def add_suffix_non_prop(suffix) -> TransformationAction:
    """
    Build a rewrite_function appending `suffix` to the matched text.

    The matched text is returned unchanged when the character located right
    before the match is `.` or `$`.
    """

    def f(m_object, _opts):
        matched = m_object[0]
        start = m_object.start()
        is_property = start > 0 and m_object.string[start - 1] in ".$"
        return matched if is_property else matched + suffix

    return f
|
||||
|
||||
|
||||
def replace_this() -> TransformationAction:
    """
    Build a rewrite_function substituting `this_rw` to "this" in the matched text.
    """
    action = replace("this", this_rw)
    return action
|
||||
|
||||
|
||||
def replace_this_non_prop() -> TransformationAction:
    """
    Create a rewrite_function replacing "this" by `this_rw`.

    Replacement happen only if "this" is not a property of an object, i.e. the
    character located right before the match is neither `.` nor `$`. When this
    character is a newline, a `;` is prepended to the replacement.
    """

    def f(m_object, _opts):
        offset = m_object.start()
        prev = m_object.string[offset - 1] if offset > 0 else ""
        if prev == "\n":
            return m_object[0].replace("this", ";" + this_rw)
        # `prev` is empty when the match starts the text: `"" in ".$"` is always
        # True in Python, so emptiness must be ruled out explicitly or such a
        # match would wrongly be considered a property access
        if prev and prev in ".$":
            return m_object[0]
        return m_object[0].replace("this", this_rw)

    return f
|
||||
|
||||
|
||||
def replace_import(src, target) -> TransformationAction:
    """
    Build a rewrite_function substituting `target` to `src` in the matched text.

    It is meant for `import ...` rewriting: the matched text is followed by
    `import.meta.url, ` when the `isModule` option is set, by `"", ` otherwise.
    """

    def f(m_object, opts):
        if opts and opts.get("isModule"):
            extra_arg = "import.meta.url, "
        else:
            extra_arg = '"", '
        return m_object[0].replace(src, target) + extra_arg

    return f
|
||||
|
||||
|
||||
def create_js_rules() -> list[TransformationRule]:
    """
    This function creates all the transformation rules.

    A transformation rule is a tuple (Regex, rewrite_function).
    If the regex matches in the rewritten script, the corresponding match object
    will be passed to rewrite_function.
    Every rewrite_function must take an `opts` dictionary which will be the opts
    passed to the `JsRewriter.rewrite` function.
    This is mostly as if we were calling `re.sub(regex, rewrite_function, script_text)`.

    The regexes will be combined and will match any non overlapping text.
    So the first rule to match will be applied, potentially preventing further
    rules to match.
    """

    # This will replace `location = `. This will "see" with wombat and set what have to
    # be set.
    check_loc = (
        "((self.__WB_check_loc && self.__WB_check_loc(location, arguments)) || "
        "{}).href = "
    )

    # This will replace `eval(...)`.
    eval_str = (
        "WB_wombat_runEval2((_______eval_arg, isGlobal) => { var ge = eval; return "
        "isGlobal ? ge(_______eval_arg) : "
        "eval(_______eval_arg); }).eval(this, (function() { return arguments })(),"
    )

    return [
        # rewriting `eval(...)` - invocation
        (re.compile(r"(?:^|\s)\beval\s*\("), replace_prefix_from(eval_str, "eval")),
        # rewriting `x = eval` - no invocation
        (re.compile(r"[=]\s*\beval\b(?![(:.$])"), replace("eval", "self.eval")),
        # rewriting `.postMessage` -> `__WB_pmw(self).postMessage`
        (re.compile(r"\.postMessage\b\("), add_prefix(".__WB_pmw(self)")),
        # rewriting `location = ` to custom expression `(...).href =` assignment
        (
            re.compile(r"[^$.]?\s?\blocation\b\s*[=]\s*(?![\s\d=])"),
            add_suffix_non_prop(check_loc),
        ),
        # rewriting `return this`
        (re.compile(r"\breturn\s+this\b\s*(?![\s\w.$])"), replace_this()),
        # rewriting `this.` special properties access on new line, with ; prepended
        # if prev chars is `\n`, or if prev is not `.` or `$`, no semi
        (
            re.compile(
                rf"[^$.]\s?\bthis\b(?=(?:\.(?:{'|'.join(GLOBAL_OVERRIDES)})\b))"
            ),
            replace_this_non_prop(),
        ),
        # rewrite `= this` or `, this`
        (re.compile(r"[=,]\s*\bthis\b\s*(?![\s\w:.$])"), replace_this()),
        # rewrite `})(this_rw)`
        (re.compile(r"\}(?:\s*\))?\s*\(this\)"), replace_this()),
        # rewrite this in && or || expr
        # (the negative lookahead must be complete: a bare `(?)` is not a valid
        # regex and makes `re.compile` fail)
        (
            re.compile(r"[^|&][|&]{2}\s*this\b\s*(?![|\s&.$](?:[^|&]|$))"),
            replace_this(),
        ),
        # ignore `async import`.
        # As the rule will match first, it will prevent next rule matching `import` to
        # be apply to `async import`.
        (re.compile(r"async\s+import\s*\("), m2str(lambda x: x)),
        # esm dynamic import, if found, mark as module
        (
            re.compile(r"[^$.]\bimport\s*\("),
            replace_import("import", "____wb_rewrite_import__"),
        ),
    ]
|
||||
|
||||
|
||||
REWRITE_JS_RULES = create_js_rules()
|
||||
|
||||
|
||||
class JsRewriter(RxRewriter):
    """
    JsRewriter is in charge of rewriting the js code stored in our zim file.

    It applies REWRITE_JS_RULES (plus an ESM import rule for module scripts) and then
    adds the declarations the rewritten code relies on: an `import` statement for
    module scripts, or a wrapping block of `let` declarations for other scripts in
    which GLOBALS_RX finds a match.
    """

    def __init__(
        self,
        url_rewriter: ArticleUrlRewriter,
        base_href: str | None,
        notify_js_module: Callable[[ZimPath], None],
    ):
        """
        Args:
            url_rewriter: rewriter used to compute the rewritten URL / ZIM path of
              imported modules and of the module declaration script.
            base_href: base href passed to `url_rewriter` when resolving import URLs.
            notify_js_module: callback called with the ZIM path of every import
              rewritten by the ESM import rule.
        """
        # rules are (re)compiled on every call to `rewrite`, hence no rule here
        super().__init__(None)
        self.first_buff = self._init_local_declaration(GLOBAL_OVERRIDES)
        self.last_buff = "\n}"
        self.url_rewriter = url_rewriter
        self.notify_js_module = notify_js_module
        self.base_href = base_href

    def _init_local_declaration(self, local_decls: Iterable[str]) -> str:
        """
        Create the prefix text to add at beginning of script.

        `rewrite` adds it (together with `last_buff`, which closes the block opened
        here) only to non-module scripts in which GLOBALS_RX finds a match.
        """
        assign_func = "_____WB$wombat$assign$function_____"
        header = (
            f"var {assign_func} = function(name) "
            "{return (self._wb_wombat && self._wb_wombat.local_init && "
            "self._wb_wombat.local_init(name)) || self[name]; };\n"
            "if (!self.__WB_pmw) { self.__WB_pmw = function(obj) "
            "{ this.__WB_source = obj; return this; } }\n{\n"
        )
        # one `let` per overridden name, all inside the block opened by `header`
        declarations = "".join(
            f"""let {decl} = {assign_func}("{decl}");\n""" for decl in local_decls
        )
        return header + declarations + "let arguments;\n" + "\n"

    def _get_module_decl(self, local_decls: Iterable[str]) -> str:
        """
        Create the prefix text to add at beginning of module script.

        This is added to the script only if the script is a module script. It imports
        `local_decls` from the `_zim_static/__wb_module_decl.js` entry.
        """
        wb_module_decl_url = self.url_rewriter.get_document_uri(
            ZimPath("_zim_static/__wb_module_decl.js"), ""
        )
        return (
            f"""import {{ {", ".join(local_decls)} }} from "{wb_module_decl_url}";\n"""
        )

    def rewrite(self, text: str, opts: dict[str, Any] | None = None) -> str:
        """
        Rewrite the js code in `text`.

        Options read from `opts`:
        - `isModule`: the script is an ES module; the ESM import rule is added and the
          result is prefixed with the module declaration import.
        - `inline`: newlines of the result are replaced by spaces (ignored for module
          scripts, which return before this step).
        """
        opts = opts or {}
        is_module = opts.get("isModule", False)

        # copy so that the module-level default rules are never mutated
        rules = REWRITE_JS_RULES[:]
        if is_module:
            rules.append(self._get_esm_import_rule())
        self._compile_rules(rules)

        new_text = super().rewrite(text, opts)

        if is_module:
            return self._get_module_decl(GLOBAL_OVERRIDES) + new_text

        # the check is done on the original text, not on the rewritten one
        if GLOBALS_RX.search(text):
            new_text = self.first_buff + new_text + self.last_buff

        if opts.get("inline", False):
            new_text = new_text.replace("\n", " ")

        return new_text

    def _get_esm_import_rule(self) -> TransformationRule:
        """
        Build the rule rewriting the URLs of ESM `import` statements.

        The rule matches IMPORT_MATCH_RX; inside each match, every IMPORT_HTTP_RX
        occurrence has its second group (the URL) rewritten and reported through
        `notify_js_module`.
        """

        def get_rewriten_import_url(url):
            """Rewrite the import URL

            This takes into account that the result must be a relative URL, i.e. it
            cannot be 'vendor.module.js' but must be './vendor.module.js'.
            """
            url = self.url_rewriter(url, base_href=self.base_href)
            if not url.startswith(("/", "./", "../")):
                url = "./" + url
            return url

        def sub_funct(match):
            # let the caller know which ZIM entry is used as a JS module
            self.notify_js_module(
                self.url_rewriter.get_item_path(
                    match.group(2), base_href=self.base_href
                )
            )
            return (
                f"{match.group(1)}{get_rewriten_import_url(match.group(2))}"
                f"{match.group(3)}"
            )

        def rewrite_import(m_object, _opts):
            return IMPORT_HTTP_RX.sub(sub_funct, m_object[0])

        return (IMPORT_MATCH_RX, rewrite_import)
|
|
@ -1,143 +0,0 @@
|
|||
import re
|
||||
from collections.abc import Callable, Iterable
|
||||
from typing import Any
|
||||
|
||||
# A rewrite action: called with the regex match and the options dict, returns the
# text replacing the match.
TransformationAction = Callable[[re.Match, dict], str]
# A rule pairs the pattern to search for with the action rewriting its matches.
TransformationRule = tuple[re.Pattern, TransformationAction]
|
||||
|
||||
|
||||
def m2str(function) -> "TransformationAction":
    """
    Call a rewrite_function with a string instead of a match object.

    A lot of rewrite functions don't need the match object as they are working
    directly on text. This decorator can be used on a rewrite_function taking a str:
    the function receives the whole matched text (`m_object[0]`) and the options are
    ignored.
    """

    def wrapper(m_object: re.Match, _opts: dict) -> str:
        return function(m_object[0])

    return wrapper
|
||||
|
||||
|
||||
def add_around(prefix: str, suffix: str) -> TransformationAction:
    """
    Create a rewrite_function which adds a `prefix` and a `suffix` around the match.
    """

    @m2str
    def f(x):
        return prefix + x + suffix

    return f
|
||||
|
||||
|
||||
def add_prefix(prefix: str) -> TransformationAction:
    """
    Create a rewrite_function which adds the `prefix` before the matching str.
    """

    return add_around(prefix, "")
|
||||
|
||||
|
||||
def add_suffix(suffix: str) -> TransformationAction:
    """
    Create a rewrite_function which adds the `suffix` after the matching str.
    """

    return add_around("", suffix)
|
||||
|
||||
|
||||
def replace_prefix_from(prefix: str, match: str) -> TransformationAction:
    """
    Returns a function which keeps what is before `match` in the matched text and
    replaces everything from `match` onwards with `prefix`.

    `match` must be present in the matched text, otherwise `str.index` raises a
    ValueError.
    """

    @m2str
    def f(x) -> str:
        # when `match` is at index 0 the slice is empty and only `prefix` remains
        return x[: x.index(match)] + prefix

    return f
|
||||
|
||||
|
||||
def replace(src, target) -> TransformationAction:
    """
    Create a rewrite_function replacing every `src` by `target` in the matching str.
    """

    @m2str
    def f(x):
        return x.replace(src, target)

    return f
|
||||
|
||||
|
||||
def replace_all(text: str) -> TransformationAction:
    """
    Create a rewrite_function which replaces the whole match with `text`.
    """

    @m2str
    def f(_x):
        return text

    return f
|
||||
|
||||
|
||||
class RxRewriter:
|
||||
"""
|
||||
RxRewriter is a generic rewriter base on regex.
|
||||
|
||||
The main "input" is a list of rules, each rule being a tuple (regex,
|
||||
rewriting_function). We want to apply each rule to the content. But doing it blindly
|
||||
is counter-productive. It would means that we have to do N replacements (N == number
|
||||
of rules).
|
||||
To avoid that, we create one unique regex (`compiled_rule`) equivalent to
|
||||
`(regex0|regex1|regex2|...)` and we do only one replacement with this regex.
|
||||
When we have a match, we do N regex search to know which rules is corresponding
|
||||
and we apply the associated rewriting_function.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
rules: Iterable[TransformationRule] | None = None,
|
||||
):
|
||||
self.rules = rules or []
|
||||
self.compiled_rule: re.Pattern | None = None
|
||||
if self.rules:
|
||||
self._compile_rules(self.rules)
|
||||
|
||||
def _compile_rules(self, rules: Iterable[TransformationRule]):
|
||||
"""
|
||||
Compile all the regex of the rules into only one `compiled_rules` pattern
|
||||
"""
|
||||
self.rules = rules
|
||||
rx_buff = "|".join(f"({rule[0].pattern})" for rule in rules)
|
||||
self.compiled_rule = re.compile(f"(?:{rx_buff})", re.M)
|
||||
|
||||
def rewrite(
|
||||
self,
|
||||
text: str | bytes,
|
||||
opts: dict[str, Any],
|
||||
) -> str:
|
||||
"""
|
||||
Apply the unique `compiled_rules` pattern and replace the content.
|
||||
"""
|
||||
if isinstance(text, bytes):
|
||||
text = text.decode()
|
||||
|
||||
def replace(m_object):
|
||||
"""
|
||||
This method search for the specific rule which have matched and apply it.
|
||||
"""
|
||||
for i, rule in enumerate(self.rules, 1):
|
||||
if not m_object.group(i):
|
||||
# THis is not the ith rules which match
|
||||
continue
|
||||
result = rule[1](m_object, opts)
|
||||
return result
|
||||
|
||||
assert self.compiled_rule is not None # noqa
|
||||
return self.compiled_rule.sub(replace, text)
|
|
@ -40,28 +40,21 @@ from jinja2 import Environment, PackageLoader
|
|||
from warcio import ArchiveIterator
|
||||
from warcio.recordloader import ArcWarcRecord
|
||||
from zimscraperlib.constants import (
|
||||
DEFAULT_DEV_ZIM_METADATA,
|
||||
RECOMMENDED_MAX_TITLE_LENGTH,
|
||||
)
|
||||
from zimscraperlib.download import stream_file
|
||||
from zimscraperlib.image.conversion import convert_image, convert_svg2png
|
||||
from zimscraperlib.image.probing import format_for
|
||||
from zimscraperlib.image.transformation import resize_image
|
||||
from zimscraperlib.rewriting.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
|
||||
from zimscraperlib.types import FALLBACK_MIME
|
||||
from zimscraperlib.zim import metadata
|
||||
from zimscraperlib.zim.creator import Creator
|
||||
from zimscraperlib.zim.metadata import (
|
||||
validate_description,
|
||||
validate_language,
|
||||
validate_longdescription,
|
||||
validate_tags,
|
||||
validate_title,
|
||||
)
|
||||
|
||||
from warc2zim.constants import logger
|
||||
from warc2zim.icon_finder import Icon, get_sorted_icons, icons_in_html
|
||||
from warc2zim.items import StaticArticle, StaticFile, WARCPayloadItem
|
||||
from warc2zim.language import parse_language
|
||||
from warc2zim.url_rewriting import HttpUrl, ZimPath, normalize
|
||||
from warc2zim.utils import (
|
||||
can_process_status_code,
|
||||
get_record_content,
|
||||
|
@ -140,7 +133,9 @@ class Converter:
|
|||
}
|
||||
self.source: str | None = str(args.source) if args.source else None or main_url
|
||||
self.scraper = "warc2zim " + get_version()
|
||||
self.main_path = normalize(HttpUrl(main_url)) if main_url else None
|
||||
self.main_path = (
|
||||
ArticleUrlRewriter.normalize(HttpUrl(main_url)) if main_url else None
|
||||
)
|
||||
|
||||
self.output = Path(args.output)
|
||||
self.zim_file = args.zim_file
|
||||
|
@ -271,16 +266,16 @@ class Converter:
|
|||
if not self.disable_metadata_checks:
|
||||
# Validate ZIM metadata early so that we do not waste time doing operations
|
||||
# for a scraper which will fail anyway in the end
|
||||
validate_tags("Tags", self.tags)
|
||||
metadata.TagsMetadata(self.tags)
|
||||
if self.title:
|
||||
validate_title("Title", self.title)
|
||||
metadata.TitleMetadata(self.title)
|
||||
if self.description:
|
||||
validate_description("Description", self.description)
|
||||
metadata.DescriptionMetadata(self.description)
|
||||
if self.long_description:
|
||||
validate_longdescription("LongDescription", self.long_description)
|
||||
metadata.LongDescriptionMetadata(self.long_description)
|
||||
if self.language:
|
||||
self.language = parse_language(self.language)
|
||||
validate_language("Language", self.language)
|
||||
metadata.LanguageMetadata(self.language)
|
||||
# Nota: we do not validate illustration since logic in the scraper is made
|
||||
# to always provide a valid image, at least a fallback transparent PNG and
|
||||
# final illustration is most probably not yet known at this stage
|
||||
|
@ -303,7 +298,7 @@ class Converter:
|
|||
|
||||
self.language = "eng"
|
||||
# validate language definitely, could have been retrieved from WARC or fallback
|
||||
validate_language("Language", self.language)
|
||||
metadata.LanguageMetadata(self.language)
|
||||
if not self.main_path:
|
||||
raise ValueError("Unable to find main path, aborting")
|
||||
self.title = self.title or "Untitled"
|
||||
|
@ -335,43 +330,64 @@ class Converter:
|
|||
)
|
||||
|
||||
self.creator.config_metadata(
|
||||
Name=self.name,
|
||||
Language=self.language or "eng",
|
||||
Title=self.title,
|
||||
Description=self.description,
|
||||
LongDescription=self.long_description,
|
||||
Creator=self.creator_metadata,
|
||||
Publisher=self.publisher,
|
||||
Date=datetime.date.today(), # noqa: DTZ011
|
||||
Illustration_48x48_at_1=self.illustration,
|
||||
Tags=self.tags,
|
||||
Source=self.source,
|
||||
Scraper=",".join(
|
||||
filter(
|
||||
lambda x: x, # remove None values
|
||||
[
|
||||
f"warc2zim {get_version()}",
|
||||
self.warc_software,
|
||||
self.scraper_suffix,
|
||||
],
|
||||
)
|
||||
metadata.StandardMetadataList(
|
||||
Name=metadata.NameMetadata(self.name),
|
||||
Language=metadata.LanguageMetadata(self.language),
|
||||
Title=metadata.TitleMetadata(self.title),
|
||||
Description=metadata.DescriptionMetadata(self.description),
|
||||
LongDescription=(
|
||||
metadata.LongDescriptionMetadata(self.long_description)
|
||||
if self.long_description
|
||||
else None
|
||||
),
|
||||
Creator=metadata.CreatorMetadata(self.creator_metadata),
|
||||
Publisher=metadata.PublisherMetadata(self.publisher),
|
||||
Date=metadata.DateMetadata(
|
||||
datetime.datetime.now(tz=datetime.UTC).date()
|
||||
),
|
||||
Illustration_48x48_at_1=metadata.DefaultIllustrationMetadata(
|
||||
self.illustration
|
||||
),
|
||||
Tags=(metadata.TagsMetadata(self.tags) if self.tags else None),
|
||||
Scraper=metadata.ScraperMetadata(
|
||||
",".join(
|
||||
filter(
|
||||
lambda x: x, # remove None values
|
||||
[
|
||||
f"warc2zim {get_version()}",
|
||||
self.warc_software,
|
||||
self.scraper_suffix,
|
||||
],
|
||||
)
|
||||
)
|
||||
),
|
||||
),
|
||||
).start()
|
||||
|
||||
if self.warc_start and self.warc_end:
|
||||
if self.warc_start == self.warc_end:
|
||||
self.creator.add_metadata(
|
||||
"X-ContentDate", self.warc_start.strftime("%Y-%m-%d")
|
||||
metadata.XCustomTextMetadata(
|
||||
"X-ContentDate", self.warc_start.strftime("%Y-%m-%d")
|
||||
)
|
||||
)
|
||||
else:
|
||||
self.creator.add_metadata(
|
||||
"X-ContentDate",
|
||||
f"{self.warc_start.strftime('%Y-%m-%d')},"
|
||||
f"{self.warc_end.strftime('%Y-%m-%d')}",
|
||||
metadata.XCustomTextMetadata(
|
||||
"X-ContentDate",
|
||||
f"{self.warc_start.strftime('%Y-%m-%d')},"
|
||||
f"{self.warc_end.strftime('%Y-%m-%d')}",
|
||||
)
|
||||
)
|
||||
|
||||
for filename in importlib.resources.files("warc2zim.statics").iterdir():
|
||||
for filename in importlib.resources.files(
|
||||
"zimscraperlib.rewriting.statics"
|
||||
).iterdir():
|
||||
if not filename.is_file():
|
||||
continue
|
||||
with importlib.resources.as_file(filename) as file:
|
||||
if file.suffix != ".js":
|
||||
continue
|
||||
self.creator.add_item(
|
||||
StaticArticle(filename=file, main_path=self.main_path.value)
|
||||
)
|
||||
|
@ -474,7 +490,7 @@ class Converter:
|
|||
if not (url.startswith("http://") or url.startswith("https://")):
|
||||
continue
|
||||
|
||||
zim_path = normalize(HttpUrl(url))
|
||||
zim_path = ArticleUrlRewriter.normalize(HttpUrl(url))
|
||||
|
||||
status_code = get_status_code(record)
|
||||
if not can_process_status_code(status_code):
|
||||
|
@ -493,7 +509,7 @@ class Converter:
|
|||
if zim_path not in self.redirections:
|
||||
if redirect_location := record.http_headers.get("Location"):
|
||||
try:
|
||||
redirection_zim_path = normalize(
|
||||
redirection_zim_path = ArticleUrlRewriter.normalize(
|
||||
HttpUrl(urljoin(url, redirect_location))
|
||||
)
|
||||
# Redirection to same ZIM path have to be ignored (occurs
|
||||
|
@ -563,7 +579,7 @@ class Converter:
|
|||
HTTPStatus.FOUND,
|
||||
]:
|
||||
original_path = self.main_path
|
||||
self.main_path = normalize(
|
||||
self.main_path = ArticleUrlRewriter.normalize(
|
||||
HttpUrl(
|
||||
urljoin(
|
||||
get_record_url(record),
|
||||
|
@ -708,7 +724,8 @@ class Converter:
|
|||
# compute paths of favicons so that we can process them on-the-fly while
|
||||
# iterating the records
|
||||
self.favicon_paths = {
|
||||
normalize(icon_url): icon_url for icon_url in self.favicon_urls
|
||||
ArticleUrlRewriter.normalize(icon_url): icon_url
|
||||
for icon_url in self.favicon_urls
|
||||
}
|
||||
self.favicon_contents: dict[HttpUrl, bytes | None] = {
|
||||
icon_url: None for icon_url in self.favicon_urls
|
||||
|
@ -875,7 +892,9 @@ class Converter:
|
|||
|
||||
# Or fallback to default ZIM illustration
|
||||
logger.warning("No suitable illustration found, using default")
|
||||
self.illustration = DEFAULT_DEV_ZIM_METADATA["Illustration_48x48_at_1"]
|
||||
self.illustration = (
|
||||
metadata.DEFAULT_DEV_ZIM_METADATA.Illustration_48x48_at_1.value
|
||||
)
|
||||
|
||||
def is_self_redirect(self, record, url):
|
||||
if record.rec_type != "response":
|
||||
|
@ -889,7 +908,9 @@ class Converter:
|
|||
|
||||
location = record.http_headers.get("Location", "")
|
||||
location = urljoin(url, location)
|
||||
return normalize(HttpUrl(url)) == normalize(HttpUrl(location))
|
||||
return ArticleUrlRewriter.normalize(
|
||||
HttpUrl(url)
|
||||
) == ArticleUrlRewriter.normalize(HttpUrl(location))
|
||||
|
||||
def add_items_for_warc_record(self, record):
|
||||
|
||||
|
@ -908,7 +929,7 @@ class Converter:
|
|||
logger.debug(f"Skipping record with non HTTP(S) WARC-Target-URI {url}")
|
||||
return
|
||||
|
||||
item_zim_path = normalize(HttpUrl(url))
|
||||
item_zim_path = ArticleUrlRewriter.normalize(HttpUrl(url))
|
||||
|
||||
# if include_domains is set, only include urls from those domains
|
||||
if self.include_domains:
|
||||
|
@ -981,7 +1002,7 @@ class Converter:
|
|||
and record.rec_headers["WARC-Refers-To-Target-URI"] != url
|
||||
and item_zim_path not in self.revisits
|
||||
): # pragma: no branch
|
||||
self.revisits[item_zim_path] = normalize(
|
||||
self.revisits[item_zim_path] = ArticleUrlRewriter.normalize(
|
||||
HttpUrl(record.rec_headers["WARC-Refers-To-Target-URI"])
|
||||
)
|
||||
|
||||
|
|
|
@ -11,11 +11,11 @@ from pathlib import Path
|
|||
from jinja2.environment import Template
|
||||
from libzim.writer import Hint # pyright: ignore[reportMissingModuleSource]
|
||||
from warcio.recordloader import ArcWarcRecord
|
||||
from zimscraperlib.rewriting.url_rewriting import ZimPath
|
||||
from zimscraperlib.types import get_mime_for_name
|
||||
from zimscraperlib.zim.items import StaticItem
|
||||
|
||||
from warc2zim.content_rewriting.generic import Rewriter
|
||||
from warc2zim.url_rewriting import ZimPath
|
||||
from warc2zim.rewriting import Rewriter
|
||||
from warc2zim.utils import get_record_mime_type
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from zimscraperlib.i18n import get_language_details
|
||||
from zimscraperlib.i18n import get_language_or_none
|
||||
|
||||
from warc2zim.constants import logger
|
||||
|
||||
|
@ -13,17 +13,19 @@ def parse_language(input_lang: str) -> str:
|
|||
Preserve language ordering (since it conveys meaning in ZIM metadata).
|
||||
"""
|
||||
|
||||
langs = [] # use a list to preserve order
|
||||
# transform input language into Language object (or None if not found)
|
||||
langs = [get_language_or_none(lang.strip()) for lang in input_lang.split(",")]
|
||||
|
||||
for lang in [lang.strip() for lang in input_lang.split(",")]:
|
||||
try:
|
||||
lang_data = get_language_details(lang)
|
||||
if parsed_lang := (lang_data.iso_639_3 if lang_data else None):
|
||||
if parsed_lang not in langs:
|
||||
langs.append(parsed_lang)
|
||||
except Exception:
|
||||
logger.warning(f"Skipping invalid language setting `{lang}`.")
|
||||
continue # skip unrecognized
|
||||
# get unique iso_639_3 codes, removing duplicates and None values, preserving order
|
||||
langs = list(
|
||||
dict.fromkeys(
|
||||
[
|
||||
lang.iso_639_3
|
||||
for lang in langs
|
||||
if lang is not None and lang.iso_639_3 is not None
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
if len(langs) == 0:
|
||||
logger.warning(
|
||||
|
|
|
@ -5,12 +5,12 @@ from urllib.parse import quote, urlsplit
|
|||
|
||||
from jinja2.environment import Template
|
||||
from warcio.recordloader import ArcWarcRecord
|
||||
from zimscraperlib.rewriting.css import CssRewriter
|
||||
from zimscraperlib.rewriting.html import HtmlRewriter
|
||||
from zimscraperlib.rewriting.js import JsRewriter
|
||||
from zimscraperlib.rewriting.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
|
||||
|
||||
from warc2zim.constants import logger
|
||||
from warc2zim.content_rewriting.css import CssRewriter
|
||||
from warc2zim.content_rewriting.html import HtmlRewriter
|
||||
from warc2zim.content_rewriting.js import JsRewriter
|
||||
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
|
||||
from warc2zim.utils import (
|
||||
get_record_content,
|
||||
get_record_encoding,
|
||||
|
@ -76,7 +76,9 @@ class Rewriter:
|
|||
self.path = path
|
||||
self.orig_url_str = get_record_url(record)
|
||||
self.url_rewriter = ArticleUrlRewriter(
|
||||
HttpUrl(self.orig_url_str), existing_zim_paths, missing_zim_paths
|
||||
article_url=HttpUrl(self.orig_url_str),
|
||||
existing_zim_paths=existing_zim_paths,
|
||||
missing_zim_paths=missing_zim_paths,
|
||||
)
|
||||
|
||||
self.rewrite_mode = self.get_rewrite_mode(record, mimetype)
|
|
@ -1,36 +0,0 @@
|
|||
// Module imported by rewritten ES module scripts: it exports replacements for the
// global objects listed at the bottom of this file.

// Returns what wombat's `local_init` hook gives for `name` when wombat is set up
// (and the value is truthy), the property of `self` with that name otherwise.
var wrapObj = function (name) {
  return (
    (self._wb_wombat &&
      self._wb_wombat.local_init &&
      self._wb_wombat.local_init(name)) ||
    self[name]
  );
};

// Define `__WB_pmw` only once: it records the object it is called with in
// `__WB_source` and returns `this`.
if (!self.__WB_pmw) {
  self.__WB_pmw = function (obj) {
    this.__WB_source = obj;
    return this;
  };
}

const window = wrapObj("window");
const document = wrapObj("document");
const location = wrapObj("location");
const top = wrapObj("top");
const parent = wrapObj("parent");
const frames = wrapObj("frames");
const opener = wrapObj("opener");
// `self` and `globalThis` are still needed under their real names above, hence the
// prefixed local names which are renamed on export.
const __self = wrapObj("self");
const __globalThis = wrapObj("globalThis");

export {
  window,
  document,
  location,
  top,
  parent,
  frames,
  opener,
  __self as self,
  __globalThis as globalThis,
};
|
|
@ -1,370 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# vim: ai ts=4 sts=4 et sw=4 nu
|
||||
|
||||
""" warc2zim's url rewriting tools
|
||||
|
||||
This module is about url and entry path rewriting.
|
||||
|
||||
The global scheme is the following:
|
||||
|
||||
Entries are stored in the ZIM file using their fully decoded path:
|
||||
- The full path is the full url without the scheme, username, password, port, fragment
|
||||
(i.e. "<host>/<path>(?<query_string>)"). See documentation of the `normalize` function
|
||||
for more details.
|
||||
- urldecoded: the path itself must not be urlencoded or it would conflict with ZIM
|
||||
specification and readers won't be able to retrieve it, some parts (e.g. querystring)
|
||||
might be absorbed by a web server, ...
|
||||
. This is valid : "foo/part with space/bar?key=value"
|
||||
. This is NOT valid : "foo/part%20with%20space/bar%3Fkey%3Dvalue"
|
||||
- even having multiple ? in a ZIM path is valid
|
||||
. This is valid :
|
||||
"foo/part/file with ? and +?who=Chip&Dale&question=Is there any + here?"
|
||||
. This is NOT valid :
|
||||
"foo/part/file with %3F and +?who=Chip%26Dale&quer=Is%20there%20any%20%2B%20here%3F"
|
||||
- space in query string must be stored as ` `, not `%2B`, `%20` or `+`, the `+` in a ZIM
|
||||
path means a `%2B` in a web resource (HTML document, ...):
|
||||
. This is valid : "foo/part/file?question=Is there any + here?"
|
||||
. This is NOT valid : "foo/part/file?question%3DIs%20there%20any%20%2B%20here%3F"
|
||||
|
||||
On top of that, fuzzy rules are applied on the ZIM path:
|
||||
For instance a path "https://www.youtube.com/youtubei/v1/foo/baz/things?key=value
|
||||
&other_key=other_value&videoId=xxxx&yet_another_key=yet_another_value"
|
||||
is transformed to "youtube.fuzzy.replayweb.page/youtubei/v1/foo/baz/things?videoId=xxxx"
|
||||
by slightly simplifying the path and keeping only the useful arguments in the
|
||||
querystring.
|
||||
|
||||
When rewriting documents (HTML, CSS, JS, ...), every time we find a URI to rewrite we
|
||||
start by resolving it into an absolute URL (based on the containing document absolute
|
||||
URI), applying the transformation to compute the corresponding ZIM path and we
|
||||
url-encode the whole ZIM path, so that readers will have one single blob to process,
|
||||
url-decode and find corresponding ZIM entry. Only '/' separators are considered safe
|
||||
and not url-encoded.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from pathlib import PurePosixPath
|
||||
from urllib.parse import quote, unquote, urljoin, urlsplit, urlunsplit
|
||||
|
||||
import idna
|
||||
|
||||
from warc2zim.constants import logger
|
||||
from warc2zim.rules import FUZZY_RULES
|
||||
|
||||
# Fuzzy rules with their pattern compiled once, when the module is imported
COMPILED_FUZZY_RULES = [
    {"match": re.compile(rule["pattern"]), "replace": rule["replace"]}
    for rule in FUZZY_RULES
]
|
||||
|
||||
|
||||
class HttpUrl:
    """A utility class representing an HTTP url, useful to pass this data around

    Includes a basic validation, ensuring that the scheme is http or https and that
    a hostname is provided. Instances compare equal (and hash identically) when they
    hold the same string value.
    """

    def __init__(self, value: str) -> None:
        """Store `value` after validation; raises ValueError if it is not valid."""
        HttpUrl.check_validity(value)
        self._value = value

    def __eq__(self, __value: object) -> bool:
        return isinstance(__value, HttpUrl) and __value.value == self.value

    def __hash__(self) -> int:
        return hash(self.value)

    def __str__(self) -> str:
        return f"HttpUrl({self.value})"

    @property
    def value(self) -> str:
        """The URL, exactly as passed to the constructor"""
        return self._value

    @classmethod
    def check_validity(cls, value: str) -> None:
        """Raise a ValueError if `value` is not an acceptable HTTP(S) URL"""
        parts = urlsplit(value)

        if parts.scheme.lower() not in ["http", "https"]:
            raise ValueError(
                f"Incorrect HttpUrl scheme in value: {value} {parts.scheme}"
            )

        if not parts.hostname:
            raise ValueError(f"Unsupported empty hostname in value: {value}")

        # NOTE(review): urllib.parse documents `hostname` as already lower-cased, so
        # this check presumably never fires for ordinary hostnames, even when the
        # original URL has upper-case characters in its host. Kept as-is to not
        # change which URLs are accepted -- confirm the intended behavior.
        if parts.hostname.lower() != parts.hostname:
            raise ValueError(f"Unsupported upper-case chars in hostname : {value}")
|
||||
|
||||
|
||||
class ZimPath:
    """A utility class representing a ZIM path, useful to pass this data around

    Includes a basic validation, ensuring that the path does not contain a scheme, a
    hostname, a username or a password as understood by `urlsplit`. Instances compare
    equal (and hash identically) when they hold the same string value.
    """

    def __init__(self, value: str) -> None:
        """Store `value` after validation; raises ValueError if it is not valid."""
        ZimPath.check_validity(value)
        self._value = value

    def __eq__(self, __value: object) -> bool:
        return isinstance(__value, ZimPath) and __value.value == self.value

    def __hash__(self) -> int:
        return hash(self.value)

    def __str__(self) -> str:
        return f"ZimPath({self.value})"

    @property
    def value(self) -> str:
        """The path, exactly as passed to the constructor"""
        return self._value

    @classmethod
    def check_validity(cls, value: str) -> None:
        """Raise a ValueError if `value` looks like a URL rather than a ZIM path"""
        parts = urlsplit(value)

        if parts.scheme:
            raise ValueError(f"Unexpected scheme in value: {value} {parts.scheme}")

        if parts.hostname:
            raise ValueError(f"Unexpected hostname in value: {value} {parts.hostname}")

        if parts.username:
            raise ValueError(f"Unexpected username in value: {value} {parts.username}")

        if parts.password:
            raise ValueError(f"Unexpected password in value: {value} {parts.password}")
|
||||
|
||||
|
||||
def apply_fuzzy_rules(uri: HttpUrl | str) -> str:
    """Apply fuzzy rules on a URL or relative path

    Rules of COMPILED_FUZZY_RULES are tried in order; the first one whose pattern
    matches at the beginning of the input value is applied and its result is
    returned.

    If no fuzzy rule is matching, the input is returned as-is.
    """
    value = uri.value if isinstance(uri, HttpUrl) else uri
    for rule in COMPILED_FUZZY_RULES:
        if match := rule["match"].match(value):
            # the replacement is a template expanded with the groups of the match
            return match.expand(rule["replace"])
    return value
|
||||
|
||||
|
||||
def normalize(url: HttpUrl) -> ZimPath:
    """Transform a HTTP URL into a ZIM path to use as an entry's key.

    According to RFC 3986, a URL allows only a very limited set of characters, so we
    assume by default that the url is encoded to match this specification.

    The transformation rewrites the hostname, the path and the querystring.

    The transformation drops the URL scheme, username, password, port and fragment:
    - we suppose there is no conflict of URL scheme or port: there are no two
    resources with same hostname, path and querystring but different URL scheme or
    port leading to different content
    - we consider username/password are purely authentication mechanisms which have
    no impact on the content to serve
    - we know that the fragment is never passed to the server, it stays in the
    User-Agent, so if we encounter a fragment while normalizing a URL found in a
    document, it won't make its way to the ZIM anyway and will stay client-side

    The transformation consists mainly in decoding the three components so that ZIM
    path is not encoded at all, as required by the ZIM specification.

    Decoding is done differently for the hostname (decoded with puny encoding) and
    the path and querystring (both decoded with url decoding). Runs of successive
    slashes are then collapsed into a single one, in the path and in the querystring.

    The final transformation is the application of fuzzy rules (sourced from wabac)
    to transform some URLs into replay URLs and drop some useless stuff.

    Returned value is a ZIM path, without any puny/url encoding applied, ready to be
    passed to python-libzim for UTF-8 encoding.

    Raises an Exception when the URL has no hostname.
    """

    url_parts = urlsplit(url.value)

    if not url_parts.hostname:
        raise Exception("Hostname is missing")

    # decode the hostname if it is puny-encoded
    # NOTE(review): only hostnames whose first label is puny-encoded are decoded; a
    # hostname like `www.xn--...` is kept encoded -- confirm this is intended
    hostname = (
        idna.decode(url_parts.hostname)
        if url_parts.hostname.startswith("xn--")
        else url_parts.hostname
    )

    if url_parts.path:
        # unquote the path so that it is stored unencoded in the ZIM as required by
        # ZIM specification
        path = unquote(url_parts.path)
    else:
        # if path is empty, we need a "/" to remove ambiguities, e.g.
        # https://example.com and https://example.com/ must all lead to the same ZIM
        # entry to match RFC 3986 section 6.2.3:
        # https://www.rfc-editor.org/rfc/rfc3986#section-6.2.3
        path = "/"

    # if query is missing, we do not add it at all, not even a trailing ? without
    # anything after it
    query = url_parts.query
    if query:
        # `+` in query parameter must be decoded as space first to remove ambiguities
        # between a space (encoded as `+` in url query parameter) and a real plus
        # sign (encoded as %2B but soon decoded in ZIM path); the query is then
        # unquoted so that it is stored unencoded in the ZIM as required by ZIM
        # specification
        query = "?" + unquote(query.replace("+", " "))

    fuzzified_url = apply_fuzzy_rules(
        f"{hostname}{_remove_subsequent_slashes(path)}{_remove_subsequent_slashes(query)}"
    )

    return ZimPath(fuzzified_url)
|
||||
|
||||
|
||||
def _remove_subsequent_slashes(value: str) -> str:
|
||||
"""Remove all successive occurence of a slash `/` in a given string
|
||||
|
||||
E.g `val//ue` or `val///ue` or `val////ue` (and so on) are transformed into `value`
|
||||
"""
|
||||
return re.sub(r"//+", "/", value)
|
||||
|
||||
|
||||
def get_without_fragment(url: str) -> str:
    """Return `url` with its fragment (the `#...` part) removed

    The URL is split with `urlsplit` and reassembled with `urlunsplit`.
    """
    parsed = urlsplit(url)
    return urlunsplit(parsed._replace(fragment=""))
|
||||
|
||||
|
||||
class ArticleUrlRewriter:
|
||||
"""Rewrite urls in article."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
article_url: HttpUrl,
|
||||
existing_zim_paths: set[ZimPath],
|
||||
missing_zim_paths: set[ZimPath] | None = None,
|
||||
):
|
||||
self.article_path = normalize(article_url)
|
||||
self.article_url = article_url
|
||||
self.existing_zim_paths = existing_zim_paths
|
||||
self.missing_zim_paths = missing_zim_paths
|
||||
|
||||
def get_item_path(self, item_url: str, base_href: str | None) -> ZimPath:
|
||||
"""Utility to transform an item URL into a ZimPath"""
|
||||
|
||||
item_absolute_url = urljoin(
|
||||
urljoin(self.article_url.value, base_href), item_url
|
||||
)
|
||||
return normalize(HttpUrl(item_absolute_url))
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
item_url: str,
|
||||
base_href: str | None,
|
||||
*,
|
||||
rewrite_all_url: bool = True,
|
||||
) -> str:
|
||||
"""Rewrite a url contained in a article.
|
||||
|
||||
The url is "fully" rewrited to point to a normalized entry path
|
||||
"""
|
||||
|
||||
try:
|
||||
item_url = item_url.strip()
|
||||
|
||||
# Make case of standalone fragments more straightforward
|
||||
if item_url.startswith("#"):
|
||||
return item_url
|
||||
|
||||
item_scheme = urlsplit(item_url).scheme
|
||||
if item_scheme and item_scheme not in ("http", "https"):
|
||||
return item_url
|
||||
|
||||
item_absolute_url = urljoin(
|
||||
urljoin(self.article_url.value, base_href), item_url
|
||||
)
|
||||
|
||||
item_fragment = urlsplit(item_absolute_url).fragment
|
||||
|
||||
item_path = normalize(HttpUrl(item_absolute_url))
|
||||
|
||||
if rewrite_all_url or item_path in self.existing_zim_paths:
|
||||
return self.get_document_uri(item_path, item_fragment)
|
||||
else:
|
||||
if (
|
||||
self.missing_zim_paths is not None
|
||||
and item_path not in self.missing_zim_paths
|
||||
):
|
||||
logger.debug(f"WARNING {item_path} ({item_url}) not in archive.")
|
||||
# maintain a collection of missing Zim Path to not fill the logs
|
||||
# with duplicate messages
|
||||
self.missing_zim_paths.add(item_path)
|
||||
# The url doesn't point to a known entry
|
||||
return item_absolute_url
|
||||
|
||||
except Exception as exc:
|
||||
item_scheme = item_scheme if "item_scheme" in locals() else "<not_set>"
|
||||
item_absolute_url = (
|
||||
item_absolute_url if "item_absolute_url" in locals() else "<not_set>"
|
||||
)
|
||||
item_fragment = (
|
||||
item_fragment if "item_fragment" in locals() else "<not_set>"
|
||||
)
|
||||
item_path = item_path if "item_path" in locals() else "<not_set>"
|
||||
logger.debug(
|
||||
f"Invalid URL value found in {self.article_url.value}, kept as-is. "
|
||||
f"(item_url: {item_url}, "
|
||||
f"item_scheme: {item_scheme}, "
|
||||
f"item_absolute_url: {item_absolute_url}, "
|
||||
f"item_fragment: {item_fragment}, "
|
||||
f"item_path: {item_path}, "
|
||||
f"rewrite_all_url: {rewrite_all_url}",
|
||||
exc_info=exc,
|
||||
)
|
||||
return item_url
|
||||
|
||||
def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str:
|
||||
"""Given an ZIM item path and its fragment, get the URI to use in document
|
||||
|
||||
This function transforms the path of a ZIM item we want to adress from current
|
||||
document (HTML / JS / ...) and returns the corresponding URI to use.
|
||||
|
||||
It computes the relative path based on current document location and escape
|
||||
everything which needs to be to transform the ZIM path into a valid RFC 3986 URI
|
||||
|
||||
It also append a potential trailing item fragment at the end of the resulting
|
||||
URI.
|
||||
|
||||
"""
|
||||
item_parts = urlsplit(item_path.value)
|
||||
|
||||
# item_path is both path + querystring, both will be url-encoded in the document
|
||||
# so that readers consider them as a whole and properly pass them to libzim
|
||||
item_url = item_parts.path
|
||||
if item_parts.query:
|
||||
item_url += "?" + item_parts.query
|
||||
relative_path = str(
|
||||
PurePosixPath(item_url).relative_to(
|
||||
(
|
||||
PurePosixPath(self.article_path.value)
|
||||
if self.article_path.value.endswith("/")
|
||||
else PurePosixPath(self.article_path.value).parent
|
||||
),
|
||||
walk_up=True,
|
||||
)
|
||||
)
|
||||
# relative_to removes a potential last '/' in the path, we add it back
|
||||
if item_path.value.endswith("/"):
|
||||
relative_path += "/"
|
||||
|
||||
return (
|
||||
f"{quote(relative_path, safe='/')}"
|
||||
f"{'#' + item_fragment if item_fragment else ''}"
|
||||
)
|
|
@ -1,88 +0,0 @@
|
|||
from collections.abc import Callable
|
||||
|
||||
import pytest
|
||||
|
||||
from warc2zim.content_rewriting.css import CssRewriter
|
||||
from warc2zim.content_rewriting.js import JsRewriter
|
||||
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def no_js_notify():
    """Fixture providing a JS file detection callback which ignores notifications"""

    def ignore_js_module(_: str):
        return None

    yield ignore_js_module
|
||||
|
||||
|
||||
class SimpleUrlRewriter(ArticleUrlRewriter):
    """URL rewriter stub for tests: URLs are only suffixed, other calls are mocked"""

    def __init__(self, article_url: HttpUrl, suffix: str = ""):
        # only these two attributes are set, the parent initializer is not called
        self.suffix = suffix
        self.article_url = article_url

    def __call__(
        self,
        item_url: str,
        base_href: str | None,  # noqa: ARG002
        *,
        rewrite_all_url: bool = True,  # noqa: ARG002
    ) -> str:
        """Return `item_url` followed by the configured suffix"""
        rewritten_url = item_url + self.suffix
        return rewritten_url

    def get_item_path(
        self, item_url: str, base_href: str | None  # noqa: ARG002
    ) -> ZimPath:
        """Return an empty ZIM path, whatever the arguments"""
        return ZimPath("")

    def get_document_uri(
        self, item_path: ZimPath, item_fragment: str  # noqa: ARG002
    ) -> str:
        """Return an empty URI, whatever the arguments"""
        return ""
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def simple_url_rewriter():
    """Fixture providing a factory of `SimpleUrlRewriter` for a given article URL"""

    def build_url_rewriter(url: str, suffix: str = ""):
        article_url = HttpUrl(url)
        return SimpleUrlRewriter(article_url, suffix=suffix)

    yield build_url_rewriter
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def js_rewriter():
    """Fixture providing a factory of `JsRewriter`

    The factory simply forwards its three arguments to the `JsRewriter` constructor.
    """

    def build_js_rewriter(
        url_rewriter: ArticleUrlRewriter,
        base_href: str | None,
        notify_js_module: Callable[[ZimPath], None],
    ):
        rewriter_options = {
            "url_rewriter": url_rewriter,
            "base_href": base_href,
            "notify_js_module": notify_js_module,
        }
        return JsRewriter(**rewriter_options)

    yield build_js_rewriter
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def css_rewriter():
    """Fixture providing a factory of `CssRewriter`

    The factory simply forwards its two arguments to the `CssRewriter` constructor.
    """

    def build_css_rewriter(
        url_rewriter: ArticleUrlRewriter,
        base_href: str | None,
    ):
        rewriter_options = {"url_rewriter": url_rewriter, "base_href": base_href}
        return CssRewriter(**rewriter_options)

    yield build_css_rewriter
|
|
@ -1,158 +0,0 @@
|
|||
from textwrap import dedent
|
||||
|
||||
import pytest
|
||||
|
||||
from warc2zim.content_rewriting.css import CssRewriter
|
||||
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl
|
||||
|
||||
from .utils import ContentForTests
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
ContentForTests(b"p { color: red; }"),
|
||||
ContentForTests(b"p {\n color: red;\n}"),
|
||||
ContentForTests(b"p { background: blue; }"),
|
||||
ContentForTests(b"p { background: rgb(15, 0, 52); }"),
|
||||
ContentForTests(
|
||||
b"/* See bug issue at http://exemple.com/issue/link */ p { color: blue; }"
|
||||
),
|
||||
ContentForTests(
|
||||
b"p { width= } div { background: url(http://exemple.com/img.png)}",
|
||||
b"p { width= } div { background: url(../exemple.com/img.png)}",
|
||||
),
|
||||
ContentForTests(
|
||||
b"p { width= } div { background: url('http://exemple.com/img.png')}",
|
||||
b'p { width= } div { background: url("../exemple.com/img.png")}',
|
||||
),
|
||||
ContentForTests(
|
||||
b'p { width= } div { background: url("http://exemple.com/img.png")}',
|
||||
b'p { width= } div { background: url("../exemple.com/img.png")}',
|
||||
),
|
||||
]
|
||||
)
|
||||
def no_rewrite_content(request):
|
||||
yield request.param
|
||||
|
||||
|
||||
def test_no_rewrite(no_rewrite_content):
    """Rewriting the input bytes of each case must give its expected content"""
    article_url = HttpUrl(f"http://{no_rewrite_content.article_url}")
    css_rewriter = CssRewriter(ArticleUrlRewriter(article_url, set()), base_href=None)

    rewritten = css_rewriter.rewrite(no_rewrite_content.input_bytes)

    assert rewritten == no_rewrite_content.expected_bytes.decode()
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
ContentForTests('"border:'),
|
||||
ContentForTests("border: solid 1px #c0c0c0; width= 100%"),
|
||||
# Despite being invalid, tinycss parse it as "width" property without value.
|
||||
ContentForTests("width:", "width:;"),
|
||||
ContentForTests("border-bottom-width: 1px;border-bottom-color: #c0c0c0;w"),
|
||||
ContentForTests(
|
||||
'background: url("http://exemple.com/foo.png"); width=',
|
||||
'background: url("../exemple.com/foo.png"); width=',
|
||||
),
|
||||
]
|
||||
)
|
||||
def invalid_content_inline(request):
|
||||
yield request.param
|
||||
|
||||
|
||||
def test_invalid_css_inline(invalid_content_inline):
    """Inline rewriting of each invalid CSS case must give its expected string"""
    article_url = HttpUrl(f"http://{invalid_content_inline.article_url}")
    css_rewriter = CssRewriter(ArticleUrlRewriter(article_url, set()), base_href=None)

    rewritten = css_rewriter.rewrite_inline(invalid_content_inline.input_str)

    assert rewritten == invalid_content_inline.expected_str
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
# Tinycss parse `"border:}` as a string with an unexpected eof in string.
|
||||
# At serialization, tiny try to recover and close the opened rule
|
||||
ContentForTests(b'p {"border:}', b'p {"border:}}'),
|
||||
ContentForTests(b'"p {border:}'),
|
||||
ContentForTests(b"p { border: solid 1px #c0c0c0; width= 100% }"),
|
||||
ContentForTests(b"p { width: }"),
|
||||
ContentForTests(
|
||||
b"p { border-bottom-width: 1px;border-bottom-color: #c0c0c0;w }"
|
||||
),
|
||||
ContentForTests(
|
||||
b'p { background: url("http://exemple.com/foo.png"); width= }',
|
||||
b'p { background: url("../exemple.com/foo.png"); width= }',
|
||||
),
|
||||
]
|
||||
)
|
||||
def invalid_content(request):
|
||||
yield request.param
|
||||
|
||||
|
||||
def test_invalid_cssl(invalid_content):
    """Rewriting each invalid CSS document must give its expected content"""
    article_url = HttpUrl(f"http://{invalid_content.article_url}")
    css_rewriter = CssRewriter(ArticleUrlRewriter(article_url, set()), base_href=None)

    rewritten = css_rewriter.rewrite(invalid_content.input_bytes)

    assert rewritten == invalid_content.expected_bytes.decode()
|
||||
|
||||
|
||||
def test_rewrite():
|
||||
content = b"""
|
||||
/* A comment with a link : http://foo.com */
|
||||
@import url(//fonts.googleapis.com/icon?family=Material+Icons);
|
||||
|
||||
p, input {
|
||||
color: rbg(1, 2, 3);
|
||||
background: url('http://kiwix.org/super/img');
|
||||
background-image:url('http://exemple.com/no_space_before_url');
|
||||
}
|
||||
|
||||
@font-face {
|
||||
src: url(https://f.gst.com/s/qa/v31/6xKtdSZaE8KbpRA_hJFQNcOM.woff2) format('woff2');
|
||||
}
|
||||
|
||||
@media only screen and (max-width: 40em) {
|
||||
p, input {
|
||||
background-image:url();
|
||||
}
|
||||
}"""
|
||||
|
||||
expected = """
|
||||
/* A comment with a link : http://foo.com */
|
||||
@import url(../fonts.googleapis.com/icon%3Ffamily%3DMaterial%20Icons);
|
||||
|
||||
p, input {
|
||||
color: rbg(1, 2, 3);
|
||||
background: url("super/img");
|
||||
background-image:url("../exemple.com/no_space_before_url");
|
||||
}
|
||||
|
||||
@font-face {
|
||||
src: url(../f.gst.com/s/qa/v31/6xKtdSZaE8KbpRA_hJFQNcOM.woff2) format("woff2");
|
||||
}
|
||||
|
||||
@media only screen and (max-width: 40em) {
|
||||
p, input {
|
||||
background-image:url();
|
||||
}
|
||||
}"""
|
||||
expected = dedent(expected)
|
||||
|
||||
assert (
|
||||
CssRewriter(
|
||||
ArticleUrlRewriter(HttpUrl("http://kiwix.org/article"), set()),
|
||||
base_href=None,
|
||||
).rewrite(content)
|
||||
== expected
|
||||
)
|
File diff suppressed because it is too large
Load diff
|
@ -1,319 +0,0 @@
|
|||
import pytest
|
||||
|
||||
from warc2zim.content_rewriting.js import JsRewriter
|
||||
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
|
||||
|
||||
from .utils import ContentForTests
|
||||
|
||||
|
||||
@pytest.fixture
def simple_js_rewriter(simple_url_rewriter, no_js_notify) -> JsRewriter:
    """JS rewriter for http://www.example.com, built from the two test fixtures"""
    url_rewriter = simple_url_rewriter("http://www.example.com")
    return JsRewriter(
        url_rewriter=url_rewriter,
        base_href=None,
        notify_js_module=no_js_notify,
    )
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
"a = this;",
|
||||
"return this.location",
|
||||
'func(Function("return this"));',
|
||||
"'a||this||that",
|
||||
"(a,b,Q.contains(i[t], this))",
|
||||
"a = this.location.href; exports.Foo = Foo; /* export className */",
|
||||
]
|
||||
)
|
||||
def rewrite_this_js_content(request):
|
||||
content = request.param
|
||||
yield ContentForTests(
|
||||
content,
|
||||
content.replace("this", "_____WB$wombat$check$this$function_____(this)"),
|
||||
)
|
||||
|
||||
|
||||
def test_this_js_rewrite(simple_js_rewriter: JsRewriter, rewrite_this_js_content):
    """Rewriting the input of each case must give its expected string"""
    rewritten = simple_js_rewriter.rewrite(rewrite_this_js_content.input_str)

    assert rewritten == rewrite_this_js_content.expected_str
|
||||
|
||||
|
||||
class WrappedTestContent(ContentForTests):
|
||||
@staticmethod
|
||||
def wrap_script(text: str) -> str:
|
||||
"""
|
||||
A small wrapper to help generate the expected content.
|
||||
|
||||
JsRewriter must add this local definition around all js code (when we access on
|
||||
of the local varibles)
|
||||
"""
|
||||
return (
|
||||
"var _____WB$wombat$assign$function_____ = function(name) {return (self."
|
||||
"_wb_wombat && self._wb_wombat.local_init && self._wb_wombat.local_init"
|
||||
"(name)) || self[name]; };\n"
|
||||
"if (!self.__WB_pmw) { self.__WB_pmw = function(obj) { this.__WB_source ="
|
||||
" obj; return this; } }\n"
|
||||
"{\n"
|
||||
'let window = _____WB$wombat$assign$function_____("window");\n'
|
||||
'let globalThis = _____WB$wombat$assign$function_____("globalThis");\n'
|
||||
'let self = _____WB$wombat$assign$function_____("self");\n'
|
||||
'let document = _____WB$wombat$assign$function_____("document");\n'
|
||||
'let location = _____WB$wombat$assign$function_____("location");\n'
|
||||
'let top = _____WB$wombat$assign$function_____("top");\n'
|
||||
'let parent = _____WB$wombat$assign$function_____("parent");\n'
|
||||
'let frames = _____WB$wombat$assign$function_____("frames");\n'
|
||||
'let opener = _____WB$wombat$assign$function_____("opener");\n'
|
||||
"let arguments;\n"
|
||||
"\n"
|
||||
f"{text}"
|
||||
"\n"
|
||||
"}"
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
self.expected = self.wrap_script(self.expected_str)
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
WrappedTestContent(
|
||||
"location = http://example.com/",
|
||||
"location = ((self.__WB_check_loc && self.__WB_check_loc(location, argument"
|
||||
"s)) || {}).href = http://example.com/",
|
||||
),
|
||||
WrappedTestContent(
|
||||
" location = http://example.com/2",
|
||||
" location = ((self.__WB_check_loc && self.__WB_check_loc(location, argumen"
|
||||
"ts)) || {}).href = http://example.com/2",
|
||||
),
|
||||
WrappedTestContent("func(location = 0)", "func(location = 0)"),
|
||||
WrappedTestContent(
|
||||
" location = http://example.com/2",
|
||||
" location = ((self.__WB_check_loc && self.__WB_check_loc(location, argumen"
|
||||
"ts)) || {}).href = http://example.com/2",
|
||||
),
|
||||
WrappedTestContent("window.eval(a)", "window.eval(a)"),
|
||||
WrappedTestContent("x = window.eval; x(a);", "x = window.eval; x(a);"),
|
||||
WrappedTestContent(
|
||||
"this. location = 'http://example.com/'",
|
||||
"this. location = 'http://example.com/'",
|
||||
),
|
||||
WrappedTestContent(
|
||||
"if (self.foo) { console.log('blah') }",
|
||||
"if (self.foo) { console.log('blah') }",
|
||||
),
|
||||
WrappedTestContent("window.x = 5", "window.x = 5"),
|
||||
]
|
||||
)
|
||||
def rewrite_wrapped_content(request):
|
||||
yield request.param
|
||||
|
||||
|
||||
def test_wrapped_rewrite(simple_js_rewriter: JsRewriter, rewrite_wrapped_content):
    """Rewriting the input of each wrapped case must give its expected string"""
    rewritten = simple_js_rewriter.rewrite(rewrite_wrapped_content.input_str)

    assert rewritten == rewrite_wrapped_content.expected_str
|
||||
|
||||
|
||||
class ImportTestContent(ContentForTests):
|
||||
@staticmethod
|
||||
# We want to import js stored in zim file as `_zim_static/__wb_module_decl.js` from
|
||||
# `https://exemple.com/some/path/` so path is
|
||||
# `../../../_zim_static/__wb_module_decl.js`
|
||||
def wrap_import(text: str) -> str:
|
||||
"""
|
||||
A small wrapper to help us generate the expected content for modules.
|
||||
|
||||
JsRewriter must add this import line at beginning of module codes (when code
|
||||
contains `import` or `export`)
|
||||
"""
|
||||
return (
|
||||
"import { window, globalThis, self, document, location, top, parent, "
|
||||
'frames, opener } from "../../../_zim_static/__wb_module_decl.js";\n'
|
||||
f"{text}"
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
self.article_url = "https://exemple.com/some/path/"
|
||||
self.expected = self.wrap_import(self.expected_str)
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
# import rewrite
|
||||
ImportTestContent(
|
||||
"""import "foo";
|
||||
|
||||
a = this.location""",
|
||||
"""import "foo";
|
||||
|
||||
a = _____WB$wombat$check$this$function_____(this).location""",
|
||||
),
|
||||
# import/export module rewrite
|
||||
ImportTestContent(
|
||||
"""a = this.location
|
||||
|
||||
export { a };
|
||||
""",
|
||||
"""a = _____WB$wombat$check$this$function_____(this).location
|
||||
|
||||
export { a };
|
||||
""",
|
||||
),
|
||||
# rewrite ESM module import
|
||||
ImportTestContent(
|
||||
'import "https://example.com/file.js"',
|
||||
'import "../../../example.com/file.js"',
|
||||
),
|
||||
ImportTestContent(
|
||||
'''
|
||||
import {A, B}
|
||||
from
|
||||
"https://example.com/file.js"''',
|
||||
'''
|
||||
import {A, B}
|
||||
from
|
||||
"../../../example.com/file.js"''',
|
||||
),
|
||||
ImportTestContent(
|
||||
"""
|
||||
import * from "https://example.com/file.js"
|
||||
import A from "http://example.com/path/file2.js";
|
||||
|
||||
import {C, D} from "./abc.js";
|
||||
import {X, Y} from "../parent.js";
|
||||
import {E, F, G} from "/path.js";
|
||||
import { Z } from "../../../path.js";
|
||||
|
||||
B = await import(somefile);
|
||||
""",
|
||||
"""
|
||||
import * from "../../../example.com/file.js"
|
||||
import A from "../../../example.com/path/file2.js";
|
||||
|
||||
import {C, D} from "./abc.js";
|
||||
import {X, Y} from "../parent.js";
|
||||
import {E, F, G} from "../../path.js";
|
||||
import { Z } from "../../path.js";
|
||||
|
||||
B = await ____wb_rewrite_import__(import.meta.url, somefile);
|
||||
""",
|
||||
),
|
||||
ImportTestContent(
|
||||
'import"import.js";import{A, B, C} from"test.js";(function() => { frames[0]'
|
||||
'.href = "/abc"; })',
|
||||
'import"import.js";import{A, B, C} from"test.js";(function() => { frames[0]'
|
||||
'.href = "/abc"; })',
|
||||
),
|
||||
ImportTestContent(
|
||||
"""a = location
|
||||
|
||||
export{ a, $ as b};
|
||||
""",
|
||||
"""a = location
|
||||
|
||||
export{ a, $ as b};
|
||||
""",
|
||||
),
|
||||
]
|
||||
)
|
||||
def rewrite_import_content(request):
|
||||
yield request.param
|
||||
|
||||
|
||||
def test_import_rewrite(no_js_notify, rewrite_import_content):
    """Rewriting each case as a JS module must give its expected string"""
    article_url = HttpUrl(rewrite_import_content.article_url)
    js_rewriter = JsRewriter(
        url_rewriter=ArticleUrlRewriter(article_url, set()),
        base_href=None,
        notify_js_module=no_js_notify,
    )

    rewritten = js_rewriter.rewrite(
        rewrite_import_content.input_str, opts={"isModule": True}
    )

    assert rewritten == rewrite_import_content.expected_str
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
"return this.abc",
|
||||
"return this object",
|
||||
"a = 'some, this object'",
|
||||
"{foo: bar, this: other}",
|
||||
"this.$location = http://example.com/",
|
||||
"this. $location = http://example.com/",
|
||||
"this. _location = http://example.com/",
|
||||
"this. alocation = http://example.com/",
|
||||
"this.location = http://example.com/",
|
||||
",eval(a)",
|
||||
"this.$eval(a)",
|
||||
"x = $eval; x(a);",
|
||||
"obj = { eval : 1 }",
|
||||
"x = obj.eval",
|
||||
"x = obj.eval(a)",
|
||||
"x = obj._eval(a)",
|
||||
"x = obj.$eval(a)",
|
||||
"if (a.self.foo) { console.log('blah') }",
|
||||
"a.window.x = 5",
|
||||
" postMessage({'a': 'b'})",
|
||||
"simport(5);",
|
||||
"a.import(5);",
|
||||
"$import(5);",
|
||||
"async import(val) { ... }",
|
||||
"""function blah() {
|
||||
const text = "text: import a from B.js";
|
||||
}
|
||||
""",
|
||||
"""function blah() {
|
||||
const text = `
|
||||
import a from "https://example.com/B.js"
|
||||
`;
|
||||
}
|
||||
|
||||
""",
|
||||
"let a = 7; var b = 5; const foo = 4;\n\n",
|
||||
]
|
||||
)
|
||||
def no_rewrite_js_content(request):
|
||||
yield request.param
|
||||
|
||||
|
||||
def test_no_rewrite(simple_js_rewriter: JsRewriter, no_rewrite_js_content):
    """Each of these JS snippets must come out of the rewriter unchanged"""
    rewritten = simple_js_rewriter.rewrite(no_rewrite_js_content)

    assert rewritten == no_rewrite_js_content
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "js_src,expected_js_module_path",
    [
        ("./my-module-script.js", "kiwix.org/my_folder/my-module-script.js"),
        ("../my-module-script.js", "kiwix.org/my-module-script.js"),
        ("../../../my-module-script.js", "kiwix.org/my-module-script.js"),
        ("/my-module-script.js", "kiwix.org/my-module-script.js"),
        ("//myserver.com/my-module-script.js", "myserver.com/my-module-script.js"),
        (
            "https://myserver.com/my-module-script.js",
            "myserver.com/my-module-script.js",
        ),
    ],
)
def test_js_rewrite_nested_module_detected(js_src, expected_js_module_path):
    """Rewriting a module importing `js_src` must notify exactly one ZIM path"""

    # ZIM paths received through the notification callback, in call order
    notified_paths = []

    def record_js_module(zim_path: ZimPath):
        notified_paths.append(zim_path)

    article_url = HttpUrl("http://kiwix.org/my_folder/my_article.html")
    js_rewriter = JsRewriter(
        url_rewriter=ArticleUrlRewriter(article_url, set()),
        base_href=None,
        notify_js_module=record_js_module,
    )
    js_rewriter.rewrite(f'import * from "{js_src}"', opts={"isModule": True})

    assert len(notified_paths) == 1
    assert notified_paths[0].value == expected_js_module_path
|
|
@ -15,7 +15,7 @@ def test_title_validation(title, is_valid):
|
|||
if is_valid:
|
||||
assert main(args) == 100
|
||||
else:
|
||||
with pytest.raises(ValueError, match="Title is too long"):
|
||||
with pytest.raises(ValueError, match="Title value is too long"):
|
||||
main(args)
|
||||
|
||||
|
||||
|
@ -35,7 +35,7 @@ def test_description_validation(description, is_valid):
|
|||
if is_valid:
|
||||
assert main(args) == 100
|
||||
else:
|
||||
with pytest.raises(ValueError, match="Description is too long"):
|
||||
with pytest.raises(ValueError, match="Description value is too long"):
|
||||
main(args)
|
||||
|
||||
|
||||
|
@ -62,7 +62,7 @@ def test_long_description_validation(long_description, is_valid):
|
|||
if is_valid:
|
||||
assert main(args) == 100
|
||||
else:
|
||||
with pytest.raises(ValueError, match="Description is too long"):
|
||||
with pytest.raises(ValueError, match="LongDescription value is too long"):
|
||||
main(args)
|
||||
|
||||
|
||||
|
|
|
@ -4,9 +4,9 @@ import pytest
|
|||
from jinja2 import Template
|
||||
from warcio import StatusAndHeaders
|
||||
from warcio.recordloader import ArcWarcRecord
|
||||
from zimscraperlib.rewriting.url_rewriting import ZimPath
|
||||
|
||||
from warc2zim.content_rewriting.generic import Rewriter
|
||||
from warc2zim.url_rewriting import ZimPath
|
||||
from warc2zim.rewriting import Rewriter
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
|
|
|
@ -1,538 +0,0 @@
|
|||
import pytest
|
||||
|
||||
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "article_url, original_content_url, expected_rewriten_content_url, know_paths, "
    "rewrite_all_url",
    [
        (
            "https://kiwix.org/a/article/document.html",
            "foo.html",
            "foo.html",
            ["kiwix.org/a/article/foo.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "foo.html#anchor1",
            "foo.html#anchor1",
            ["kiwix.org/a/article/foo.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "foo.html?foo=bar",
            "foo.html%3Ffoo%3Dbar",
            ["kiwix.org/a/article/foo.html?foo=bar"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "foo.html?foo=b%24ar",
            "foo.html%3Ffoo%3Db%24ar",
            ["kiwix.org/a/article/foo.html?foo=b$ar"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "foo.html?foo=b%3Far",  # a query string with an encoded ? char in value
            "foo.html%3Ffoo%3Db%3Far",
            ["kiwix.org/a/article/foo.html?foo=b?ar"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "fo%o.html",
            "fo%25o.html",
            ["kiwix.org/a/article/fo%o.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "foé.html",  # URL not matching RFC 3986 (many HTML documents are invalid)
            "fo%C3%A9.html",  # character is encoded so that URL match RFC 3986
            ["kiwix.org/a/article/foé.html"],  # but ZIM path is non-encoded
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "./foo.html",
            "foo.html",
            ["kiwix.org/a/article/foo.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "../foo.html",
            "https://kiwix.org/a/foo.html",  # Full URL since not in known URLs
            ["kiwix.org/a/article/foo.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "../foo.html",
            "../foo.html",  # all URLs rewrite activated
            ["kiwix.org/a/article/foo.html"],
            True,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "../foo.html",
            "../foo.html",
            ["kiwix.org/a/foo.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "../bar/foo.html",
            "https://kiwix.org/a/bar/foo.html",  # Full URL since not in known URLs
            ["kiwix.org/a/article/foo.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "../bar/foo.html",
            "../bar/foo.html",  # all URLs rewrite activated
            ["kiwix.org/a/article/foo.html"],
            True,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "../bar/foo.html",
            "../bar/foo.html",
            ["kiwix.org/a/bar/foo.html"],
            False,
        ),
        (  # we cannot go upper than host, so '../' in excess are removed
            "https://kiwix.org/a/article/document.html",
            "../../../../../foo.html",
            "../../foo.html",
            ["kiwix.org/foo.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "foo?param=value",
            "foo%3Fparam%3Dvalue",
            ["kiwix.org/a/article/foo?param=value"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "foo?param=value%2F",
            "foo%3Fparam%3Dvalue/",
            ["kiwix.org/a/article/foo?param=value/"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "foo?param=value%2Fend",
            "foo%3Fparam%3Dvalue/end",
            ["kiwix.org/a/article/foo?param=value/end"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "foo/",
            "foo/",
            ["kiwix.org/a/article/foo/"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "/fo o.html",
            "../../fo%20o.html",
            ["kiwix.org/fo o.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "/fo+o.html",
            "../../fo%2Bo.html",
            ["kiwix.org/fo+o.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "/fo%2Bo.html",
            "../../fo%2Bo.html",
            ["kiwix.org/fo+o.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "/foo.html?param=val+ue",
            "../../foo.html%3Fparam%3Dval%20ue",
            ["kiwix.org/foo.html?param=val ue"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "/fo~o.html",
            "../../fo~o.html",
            ["kiwix.org/fo~o.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "/fo-o.html",
            "../../fo-o.html",
            ["kiwix.org/fo-o.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "/fo_o.html",
            "../../fo_o.html",
            ["kiwix.org/fo_o.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "/fo%7Eo.html",  # must not be encoded / must be decoded (RFC 3986 #2.3)
            "../../fo~o.html",
            ["kiwix.org/fo~o.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "/fo%2Do.html",  # must not be encoded / must be decoded (RFC 3986 #2.3)
            "../../fo-o.html",
            ["kiwix.org/fo-o.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "/fo%5Fo.html",  # must not be encoded / must be decoded (RFC 3986 #2.3)
            "../../fo_o.html",
            ["kiwix.org/fo_o.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "/foo%2Ehtml",  # must not be encoded / must be decoded (RFC 3986 #2.3)
            "../../foo.html",
            ["kiwix.org/foo.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "#anchor1",
            "#anchor1",
            ["kiwix.org/a/article/document.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/",
            "#anchor1",
            "#anchor1",
            ["kiwix.org/a/article/"],
            False,
        ),
        (
            "https://kiwix.org/a/article/",
            "../article/",
            "./",
            ["kiwix.org/a/article/"],
            False,
        ),
    ],
)
def test_relative_url(
    article_url,
    know_paths,
    original_content_url,
    expected_rewriten_content_url,
    rewrite_all_url,
):
    """Check how URLs found in an article are rewritten

    `know_paths` are the ZIM paths given to the rewriter as existing entries.
    """
    article = HttpUrl(article_url)
    known_zim_paths = {ZimPath(path) for path in know_paths}
    rewriter = ArticleUrlRewriter(article, known_zim_paths)

    rewritten_url = rewriter(
        original_content_url, base_href=None, rewrite_all_url=rewrite_all_url
    )

    assert rewritten_url == expected_rewriten_content_url
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "article_url, original_content_url, expected_rewriten_content_url, know_paths, "
    "rewrite_all_url",
    [
        (
            "https://kiwix.org/a/article/document.html",
            "/foo.html",
            "../../foo.html",
            ["kiwix.org/foo.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "/bar.html",
            "https://kiwix.org/bar.html",  # Full URL since not in known URLs
            ["kiwix.org/foo.html"],
            False,
        ),
        (
            "https://kiwix.org/a/article/document.html",
            "/bar.html",
            "../../bar.html",  # all URLs rewrite activated
            ["kiwix.org/foo.html"],
            True,
        ),
    ],
)
def test_absolute_path_url(
    article_url,
    know_paths,
    original_content_url,
    expected_rewriten_content_url,
    rewrite_all_url,
):
    """Check how URLs made of an absolute path (starting with `/`) are rewritten

    `know_paths` are the ZIM paths given to the rewriter as existing entries.
    """
    article = HttpUrl(article_url)
    known_zim_paths = {ZimPath(path) for path in know_paths}
    rewriter = ArticleUrlRewriter(article, known_zim_paths)

    rewritten_url = rewriter(
        original_content_url, base_href=None, rewrite_all_url=rewrite_all_url
    )

    assert rewritten_url == expected_rewriten_content_url
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "article_url, original_content_url, expected_rewriten_content_url, know_paths, "
    "rewrite_all_url",
    [
        # same host, known ZIM path: rewritten relative to the article
        (
            "https://kiwix.org/a/article/document.html",
            "//kiwix.org/foo.html",
            "../../foo.html",
            ["kiwix.org/foo.html"],
            False,
        ),
        # same host, not a known ZIM path: the full URL is expected
        (
            "https://kiwix.org/a/article/document.html",
            "//kiwix.org/bar.html",
            "https://kiwix.org/bar.html",
            ["kiwix.org/foo.html"],
            False,
        ),
        # same host, not known, but rewriting of all URLs is requested
        (
            "https://kiwix.org/a/article/document.html",
            "//kiwix.org/bar.html",
            "../../bar.html",
            ["kiwix.org/foo.html"],
            True,
        ),
        # other host, known ZIM path
        (
            "https://kiwix.org/a/article/document.html",
            "//acme.com/foo.html",
            "../../../acme.com/foo.html",
            ["acme.com/foo.html"],
            False,
        ),
        # other host, not known: full URL expected, with the article's scheme
        (
            "http://kiwix.org/a/article/document.html",
            "//acme.com/bar.html",
            "http://acme.com/bar.html",
            ["kiwix.org/foo.html"],
            False,
        ),
        # other host, not known, but rewriting of all URLs is requested
        (
            "https://kiwix.org/a/article/document.html",
            "//acme.com/bar.html",
            "../../../acme.com/bar.html",
            ["kiwix.org/foo.html"],
            True,
        ),
        # puny-encoded host is transformed into url-encoded value
        (
            "https://kiwix.org/a/article/document.html",
            "//xn--exmple-cva.com/a/article/document.html",
            "../../../ex%C3%A9mple.com/a/article/document.html",
            ["exémple.com/a/article/document.html"],
            False,
        ),
        # host which should be puny-encoded is transformed into url-encoded value
        (
            "https://kiwix.org/a/article/document.html",
            "//exémple.com/a/article/document.html",
            "../../../ex%C3%A9mple.com/a/article/document.html",
            ["exémple.com/a/article/document.html"],
            False,
        ),
    ],
)
def test_absolute_scheme_url(
    article_url,
    know_paths,
    original_content_url,
    expected_rewriten_content_url,
    rewrite_all_url,
):
    """Check rewriting of scheme-relative content URLs (``//host/path``).

    Each case builds a rewriter for ``article_url`` with ``know_paths`` as the set
    of known ZIM paths, rewrites ``original_content_url`` without any base href and
    compares the result with ``expected_rewriten_content_url``.
    """
    known_zim_paths = set(map(ZimPath, know_paths))
    rewrite = ArticleUrlRewriter(HttpUrl(article_url), known_zim_paths)
    rewritten = rewrite(
        original_content_url, base_href=None, rewrite_all_url=rewrite_all_url
    )
    assert rewritten == expected_rewriten_content_url
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "article_url, original_content_url, expected_rewriten_content_url, know_paths, "
    "rewrite_all_url",
    [
        # https article, https target on another host
        (
            "https://kiwix.org/a/article/document.html",
            "https://foo.org/a/article/document.html",
            "../../../foo.org/a/article/document.html",
            ["foo.org/a/article/document.html"],
            False,
        ),
        # https article, http target
        (
            "https://kiwix.org/a/article/document.html",
            "http://foo.org/a/article/document.html",
            "../../../foo.org/a/article/document.html",
            ["foo.org/a/article/document.html"],
            False,
        ),
        # http article, https target
        (
            "http://kiwix.org/a/article/document.html",
            "https://foo.org/a/article/document.html",
            "../../../foo.org/a/article/document.html",
            ["foo.org/a/article/document.html"],
            False,
        ),
        # credentials and port of the target are absent from the expected result
        (
            "http://kiwix.org/a/article/document.html",
            "https://user:password@foo.org:8080/a/article/document.html",
            "../../../foo.org/a/article/document.html",
            ["foo.org/a/article/document.html"],
            False,
        ),
        # Full URL since not in known URLs
        (
            "https://kiwix.org/a/article/document.html",
            "https://foo.org/a/article/document.html",
            "https://foo.org/a/article/document.html",
            ["kiwix.org/a/article/foo/"],
            False,
        ),
        # all URLs rewrite activated
        (
            "https://kiwix.org/a/article/document.html",
            "https://foo.org/a/article/document.html",
            "../../../foo.org/a/article/document.html",
            ["kiwix.org/a/article/foo/"],
            True,
        ),
        # puny-encoded host is transformed into url-encoded value
        (
            "https://kiwix.org/a/article/document.html",
            "https://xn--exmple-cva.com/a/article/document.html",
            "../../../ex%C3%A9mple.com/a/article/document.html",
            ["exémple.com/a/article/document.html"],
            False,
        ),
        # host which should be puny-encoded is transformed into url-encoded value
        (
            "https://kiwix.org/a/article/document.html",
            "https://exémple.com/a/article/document.html",
            "../../../ex%C3%A9mple.com/a/article/document.html",
            ["exémple.com/a/article/document.html"],
            False,
        ),
    ],
)
def test_absolute_url(
    article_url,
    know_paths,
    original_content_url,
    expected_rewriten_content_url,
    rewrite_all_url,
):
    """Check rewriting of fully qualified content URLs (``scheme://host/path``).

    Each case builds a rewriter for ``article_url`` with ``know_paths`` as the set
    of known ZIM paths, rewrites ``original_content_url`` without any base href and
    compares the result with ``expected_rewriten_content_url``.
    """
    known_zim_paths = set(map(ZimPath, know_paths))
    rewrite = ArticleUrlRewriter(HttpUrl(article_url), known_zim_paths)
    rewritten = rewrite(
        original_content_url, base_href=None, rewrite_all_url=rewrite_all_url
    )
    assert rewritten == expected_rewriten_content_url
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "original_content_url, rewrite_all_url",
    [
        # every URL is exercised with rewrite_all_url off first, then on
        (content_url, rewrite_all)
        for rewrite_all in (False, True)
        for content_url in (
            "data:0548datacontent",
            "blob:exemple.com/url",
            "mailto:bob@acme.com",
            "tel:+33.1.12.12.23",
        )
    ],
)
def test_no_rewrite_other_schemes(original_content_url, rewrite_all_url):
    """Check that data:, blob:, mailto: and tel: URLs come back unchanged.

    Other schemes are never rewritten, even when ``rewrite_all_url`` is true. The
    rewriter is built with an empty set of known ZIM paths.
    """
    rewrite = ArticleUrlRewriter(
        HttpUrl("https://kiwix.org/a/article/document.html"), set()
    )
    rewritten = rewrite(
        original_content_url, base_href=None, rewrite_all_url=rewrite_all_url
    )
    assert rewritten == original_content_url
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "original_content_url, know_path, base_href, expected_rewriten_content_url",
    [
        # without a base href the link is resolved against the article itself
        pytest.param(
            "foo.html",
            "kiwix.org/a/article/foo.html",
            None,
            "foo.html",
            id="no_base",
        ),
        # base href pointing to the parent folder of the article
        pytest.param(
            "foo.html",
            "kiwix.org/a/foo.html",
            "../",
            "../foo.html",
            id="parent_base",
        ),
        # base href pointing to a sibling folder of the article's folder
        pytest.param(
            "foo.html",
            "kiwix.org/a/bar/foo.html",
            "../bar/",
            "../bar/foo.html",
            id="base_in_another_folder",
        ),
        # base href given as an absolute URL on another host
        pytest.param(
            "foo.html",
            "www.example.com/foo.html",
            "https://www.example.com/",
            "../../../www.example.com/foo.html",
            id="base_on_absolute_url",
        ),
    ],
)
def test_base_href(
    original_content_url,
    know_path,
    base_href,
    expected_rewriten_content_url,
):
    """Check that ``base_href`` is taken into account when rewriting a link.

    The article is always ``https://kiwix.org/a/article/document.html`` and
    ``know_path`` is the single known ZIM path; ``rewrite_all_url`` stays false.
    """
    rewrite = ArticleUrlRewriter(
        HttpUrl("https://kiwix.org/a/article/document.html"),
        {ZimPath(know_path)},
    )
    rewritten = rewrite(
        original_content_url, base_href=base_href, rewrite_all_url=False
    )
    assert rewritten == expected_rewriten_content_url
|
|
@ -17,7 +17,6 @@ from zimscraperlib.zim import Archive
|
|||
from warc2zim.__about__ import __version__
|
||||
from warc2zim.converter import iter_warc_records
|
||||
from warc2zim.main import main
|
||||
from warc2zim.url_rewriting import HttpUrl, ZimPath, normalize
|
||||
from warc2zim.utils import get_record_url
|
||||
|
||||
ZIM_ILLUSTRATION_SIZE = 48
|
||||
|
@ -242,101 +241,6 @@ class TestWarc2Zim:
|
|||
)
|
||||
return dst.getvalue()
|
||||
|
||||
@pytest.mark.parametrize(
    "url,zim_path",
    [
        ("https://exemple.com", "exemple.com/"),
        ("https://exemple.com/", "exemple.com/"),
        ("http://example.com/resource", "example.com/resource"),
        ("http://example.com/resource/", "example.com/resource/"),
        (
            "http://example.com/resource/folder/sub.txt",
            "example.com/resource/folder/sub.txt",
        ),
        (
            "http://example.com/resource/folder/sub",
            "example.com/resource/folder/sub",
        ),
        (
            "http://example.com/resource/folder/sub?foo=bar",
            "example.com/resource/folder/sub?foo=bar",
        ),
        (
            "http://example.com/resource/folder/sub?foo=bar#anchor1",
            "example.com/resource/folder/sub?foo=bar",
        ),
        ("http://example.com/resource/#anchor1", "example.com/resource/"),
        ("http://example.com/resource/?foo=bar", "example.com/resource/?foo=bar"),
        ("http://example.com#anchor1", "example.com/"),
        ("http://example.com?foo=bar#anchor1", "example.com/?foo=bar"),
        ("http://example.com/?foo=bar", "example.com/?foo=bar"),
        ("http://example.com/?foo=ba+r", "example.com/?foo=ba r"),
        (
            "http://example.com/?foo=ba r",
            "example.com/?foo=ba r",
        ),  # situation where the ` ` has not been properly escaped in document
        ("http://example.com/?foo=ba%2Br", "example.com/?foo=ba+r"),
        ("http://example.com/?foo=ba+%2B+r", "example.com/?foo=ba + r"),
        ("http://example.com/#anchor1", "example.com/"),
        (
            "http://example.com/some/path/http://example.com//some/path",
            "example.com/some/path/http:/example.com/some/path",
        ),
        (
            "http://example.com/some/pa?th/http://example.com//some/path",
            "example.com/some/pa?th/http:/example.com/some/path",
        ),
        (
            "http://example.com/so?me/pa?th/http://example.com//some/path",
            "example.com/so?me/pa?th/http:/example.com/some/path",
        ),
        ("http://example.com/resource?", "example.com/resource"),
        ("http://example.com/resource#", "example.com/resource"),
        ("http://user@example.com/resource", "example.com/resource"),
        ("http://user:password@example.com/resource", "example.com/resource"),
        ("http://example.com:8080/resource", "example.com/resource"),
        (
            "http://foobargooglevideo.com/videoplayback?id=1576&key=value",
            "youtube.fuzzy.replayweb.page/videoplayback?id=1576",
        ),  # Fuzzy rule is applied in addition to path transformations
        ("https://xn--exmple-cva.com", "exémple.com/"),
        ("https://xn--exmple-cva.com/", "exémple.com/"),
        ("https://xn--exmple-cva.com/resource", "exémple.com/resource"),
        ("https://exémple.com/", "exémple.com/"),
        ("https://exémple.com/resource", "exémple.com/resource"),
        # host_ip is an invalid hostname according to spec
        ("https://host_ip/", "host_ip/"),
        ("https://host_ip/resource", "host_ip/resource"),
        ("https://192.168.1.1/", "192.168.1.1/"),
        ("https://192.168.1.1/resource", "192.168.1.1/resource"),
        ("http://example.com/res%24urce", "example.com/res$urce"),
        (
            "http://example.com/resource?foo=b%24r",
            "example.com/resource?foo=b$r",
        ),
        ("http://example.com/resource@300x", "example.com/resource@300x"),
        # the port-only case is already covered above, so only the combinations
        # of credentials and port are listed here
        ("http://user@example.com:8080/resource", "example.com/resource"),
        ("http://user:password@example.com:8080/resource", "example.com/resource"),
        # the two URIs below illustrate a potential collision (two different
        # URIs leading to the same ZIM path)
        (
            "http://tmp.kiwix.org/ci/test-website/images/urlencoding1_ico%CC%82ne-"
            "de%CC%81buter-Solidarite%CC%81-Nume%CC%81rique_1%40300x.png",
            "tmp.kiwix.org/ci/test-website/images/urlencoding1_icône-débuter-"
            "Solidarité-Numérique_1@300x.png",
        ),
        (
            "https://tmp.kiwix.org/ci/test-website/images/urlencoding1_ico%CC%82ne-"
            "de%CC%81buter-Solidarite%CC%81-Nume%CC%81rique_1@300x.png",
            "tmp.kiwix.org/ci/test-website/images/urlencoding1_icône-débuter-"
            "Solidarité-Numérique_1@300x.png",
        ),
    ],
)
def test_normalize(self, url, zim_path):
    """Check that ``normalize`` maps ``url`` to the expected ZIM path.

    The comparison is made on the ``value`` attribute of the object returned by
    ``normalize(HttpUrl(url))`` and of ``ZimPath(zim_path)``.
    """
    assert normalize(HttpUrl(url)).value == ZimPath(zim_path).value
|
||||
|
||||
def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
|
||||
zim_output = "zim-out-filename.zim"
|
||||
main(
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue