Adapt to zimscraperlib 5.0.0 - including all rewriting logic moved there - and upgrade other dependencies

This commit is contained in:
benoit74 2025-01-07 13:12:33 +00:00
parent 5040eeeffb
commit 1218df0560
No known key found for this signature in database
GPG key ID: B89606434FC7B530
46 changed files with 127 additions and 8886 deletions

View file

@ -6,7 +6,7 @@ on:
jobs:
publish:
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
permissions:
id-token: write # mandatory for PyPI trusted publishing
@ -24,17 +24,6 @@ jobs:
pip install -U pip
pip install -e .[scripts]
- name: Generate fuzzy rules
run: python rules/generate_rules.py
- name: Build Javascript wombatSetup.js
uses: addnab/docker-run-action@v3
with:
image: node:20-bookworm
options: -v ${{ github.workspace }}/src/warc2zim/statics:/output -v ${{ github.workspace }}/rules:/src/rules -v ${{ github.workspace }}/javascript:/src/javascript -v ${{ github.workspace }}/build_js.sh:/src/build_js.sh
run: |
/src/build_js.sh
- name: Build packages
run: |
pip install -U pip build

View file

@ -7,19 +7,11 @@ on:
jobs:
publish:
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
- name: Build Javascript wombatSetup.js
uses: addnab/docker-run-action@v3
with:
image: node:20-bookworm
options: -v ${{ github.workspace }}/src/warc2zim/statics:/output -v ${{ github.workspace }}/rules:/src/rules -v ${{ github.workspace }}/javascript:/src/javascript -v ${{ github.workspace }}/build_js.sh:/src/build_js.sh
run: |
/src/build_js.sh
- name: Build and push Docker image
uses: openzim/docker-publish-action@v10
with:

View file

@ -8,7 +8,7 @@ on:
jobs:
check-qa:
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
@ -24,9 +24,6 @@ jobs:
pip install -U pip
pip install -e .[lint,scripts,test,check]
- name: Generate fuzzy rules
run: python rules/generate_rules.py
- name: Check black formatting
run: inv lint-black
@ -35,20 +32,3 @@ jobs:
- name: Check pyright
run: inv check-pyright
- name: Set up Node.JS
uses: actions/setup-node@v4
with:
node-version: 20
- name: Install JS dependencies
working-directory: javascript
run: yarn install
- name: Check prettier formatting
working-directory: javascript
run: yarn prettier-check
- name: Check eslint rules
working-directory: javascript
run: yarn eslint

View file

@ -7,7 +7,7 @@ on:
jobs:
publish:
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4

View file

@ -8,7 +8,7 @@ on:
jobs:
run-tests:
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
@ -24,9 +24,6 @@ jobs:
pip install -U pip
pip install -e .[test,scripts]
- name: Generate fuzzy rules
run: python rules/generate_rules.py
- name: Run the tests
run: inv coverage --args "-vvv"
@ -35,21 +32,8 @@ jobs:
with:
token: ${{ secrets.CODECOV_TOKEN }}
- name: Set up Node.JS
uses: actions/setup-node@v4
with:
node-version: 20
- name: Install JS dependencies
working-directory: javascript
run: yarn install
- name: Run JS tests
working-directory: javascript
run: yarn test
build_python:
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
@ -59,21 +43,13 @@ jobs:
python-version-file: pyproject.toml
architecture: x64
- name: Install dependencies (and project)
run: |
pip install -U pip build
pip install -e .[scripts]
- name: Generate fuzzy rules
run: python rules/generate_rules.py
- name: Ensure we can build Python targets
run: |
pip install -U pip build
python3 -m build --sdist --wheel
build_docker:
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4

12
.gitignore vendored
View file

@ -495,18 +495,6 @@ pyrightconfig.json
# ignore all vscode, this is not standard configuration in this place
.vscode
# installed at build time
src/warc2zim/statics/wombat.js
# temporary directories used during development
output
tmp
# rule files are generated by rules/generate_rules.py
src/warc2zim/rules.py
tests/test_fuzzy_rules.py
javascript/src/fuzzyRules.js
javascript/test/fuzzyRules.js
# wombatSetup.js is generated with rollup
src/warc2zim/statics/wombatSetup.js

View file

@ -11,11 +11,11 @@ repos:
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.9
rev: v0.8.4
hooks:
- id: ruff
- repo: https://github.com/RobertCraigie/pyright-python
rev: v1.1.383
rev: v1.1.391
hooks:
- id: pyright
name: pyright (system)

View file

@ -9,7 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- Upgrade to wombat 3.8.6 (#334)
- Upgrade dependencies: zimscraperlib 5.0.0, warcio 1.7.5, cdxj_index 1.4.6 and others
- Use all rewriting stuff from zimscraperlib
- Remove most HTML / CSS / JS rewriting logic which is now part of zimscraperlib 5
- Fix wombat setup settings (especially `isSW`) (#293)
### Fixed

View file

@ -1,5 +1,5 @@
FROM python:3.12-slim-bookworm
LABEL org.opencontainers.image.source https://github.com/openzim/warc2zim
LABEL org.opencontainers.image.source=https://github.com/openzim/warc2zim
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
@ -12,15 +12,13 @@ RUN apt-get update -y \
WORKDIR /output
# Copy pyproject.toml and its dependencies
COPY pyproject.toml openzim.toml README.md /src/
COPY rules/generate_rules.py /src/rules/generate_rules.py
COPY pyproject.toml README.md /src/
COPY src/warc2zim/__about__.py /src/src/warc2zim/__about__.py
# Install Python dependencies
RUN pip install --no-cache-dir /src
# Copy code + associated artifacts
COPY rules /src/rules
COPY src /src/src
COPY *.md /src/

View file

@ -168,26 +168,13 @@ Start a hatch shell: this will install software including dependencies in an iso
hatch shell
```
### Regenerate wombatSetup.js
### Rewriting logic and rewriting rules
wombatSetup.js is the JS code used to setup wombat when the ZIM is used.
Almost all rewriting logic and rewriting rules now come from [python-scraperlib](https://github.com/openzim/python-scraperlib/).
It is normally retrieved by Python build process (see openzim.toml for details).
Should you need to add more rules or modify rewriting logic, this is the place to go.
Recommended solution to develop this JS code is to install Node.JS on your system, and then
```bash
cd javascript
yarn build-dev # or yarn build-prod
```
Should you want to regenerate this code without install Node.JS, you might simply run following command.
```bash
docker run -v $PWD/src/warc2zim/statics:/output -v $PWD/rules:/src/rules -v $PWD/javascript:/src/javascript -v $PWD/build_js.sh:/src/build_js.sh -it --rm --entrypoint /src/build_js.sh node:20-bookworm
```
It will install Python3 on-top of Node.JS in a Docker container, generate JS fuzzy rules and bundle JS code straight to `/src/warc2zim/statics/wombatSetup.js` where the file is expected to be placed.
All resulting code (Python and Javascript) as well as wombat.js and wombat-setup.js come from python-scraperlib.
## License

View file

@ -1,26 +0,0 @@
#!/bin/bash
# Custom script to install Python on top of a Docker Node-JS image, then install
# required Python deps, generate fuzzy rules, and finally bundle JS script

# Fail fast: abort on the first failing command, on use of an unset variable and on
# a failure anywhere in a pipeline. Without this, a failed dependency install or a
# failed rules generation would still let the bundling step run, and the script exit
# status would only reflect the last command.
set -euo pipefail

# Install a Python 3 toolchain (the Node.JS image does not provide a usable one)
apt-get update -y
apt-get install -y --no-install-recommends \
python3 python3-pip python3-venv
rm -rf /var/lib/apt/lists/*

# Isolated virtualenv holding the Python packages installed for rules generation
python3 -m venv /local
/local/bin/python -m pip install --no-cache-dir -U \
pip \
jinja2==3.1.4 \
PyYAML==6.0.2

# Generate the fuzzy rules (Python and JS) from the rules definition
/local/bin/python /src/rules/generate_rules.py

# Bundle the JS code; the bundle is written to OUTPUT_DIR
cd /src/javascript
yarn install
OUTPUT_DIR=/output yarn build-prod

View file

@ -1,84 +0,0 @@
# Functional architecture
## Foreword
At a high level, warc2zim is a piece of software capable to transform a set of WARC files into one ZIM file. From a functional point of view, it is hence a "format converter".
While warc2zim is typically used as a sub-component of zimit, where WARC files are produced by Browsertrix crawler, it is in fact agnostic of this fact and could process any WARC file adhering to the standard.
This documentation describes the main functions performed by the warc2zim codebase. It is important to note that these functions are not segregated inside the codebase by clear boundaries.
## ZIM storage
While storing the web resources in the ZIM is mostly straightforward (we just transfer the raw bytes, after some modification for URL rewriting if needed), the decision of the path where the resource will be stored is very important.
This is purely conventional, even if ZIM specification has to be respected for proper operation in readers.
This function is responsible to compute the ZIM path where a given web resource is going to be stored.
While the URL is the only driver of this computation for now, warc2zim might have to consider other contextual data in the future. E.g. the resource to serve might be dynamic, depending not only on URL query parameters but also on header(s) value(s).
## Fuzzy rules
Unfortunately, it is not always possible / desirable to store the resource with a simple transformation.
A typical situation is that some query parameters are dynamically computed by some Javascript code to include user tracking identifier, current datetime information, ...
When running again the same javascript code inside the ZIM, the URL will hence be slightly different because context has changed, but the same content needs to be retrieved.
warc2zim hence relies on fuzzy rules to transform/simplify some URLs when computing the ZIM path.
## URL Rewriting
warc2zim transforms (rewrites) URLs found in documents (HTML, CSS, JS, ...) so that they are usable inside the ZIM.
### General case
One simple example is that we might have following code in an HTML document to load an image with an absolute URL:
```
<img src="https://en.wikipedia.org/wiki/File:Kiwix_logo_v3.svg"></img>
```
The URL `https://en.wikipedia.org/wiki/File:Kiwix_logo_v3.svg` has to be transformed to a URL that it is usable inside the ZIM.
For proper reader operation, openZIM prohibits using absolute URLs, so this has to be a relative URL. This relative URL is hence dependent on the location of the resource currently being rewritten.
The table below gives some examples of what the rewritten URL is going to be, depending on the URL of the rewritten document.
| HTML document URL | image URL rewritten for usage inside the ZIM |
|--|--|
| `https://en.wikipedia.org/wiki/Kiwix` | `./File:Kiwix_logo_v3.svg` |
| `https://en.wikipedia.org/wiki` | `./wiki/File:Kiwix_logo_v3.svg` |
| `https://en.wikipedia.org/waka/Kiwix` | `../wiki/File:Kiwix_logo_v3.svg` |
| `https://fr.wikipedia.org/wiki/Kiwix` | `../../en.wikipedia.org/wiki/File:Kiwix_logo_v3.svg` |
As can be seen on the last line (but this is true for all URLs), this rewriting has to take into account the convention saying at which ZIM path a given web resource will be stored.
### Dynamic case
The explanation above more or less assumed that the transformations can be done statically, i.e warc2zim can open every known document, find existing URLs and replace them with their counterpart inside the ZIM.
While this is possible for HTML and CSS documents typically, it is not possible when the URL is dynamically computed. This is typically the case for JS documents, where in the general case the URL is not statically stored inside the JS code but computed on-the-fly by aggregating various strings and values.
Rewriting these computations is not deemed feasible due to the huge variety of situation which might be encountered.
A specific function is hence needed to rewrite URL **live in client browser**, intercept any function triggering a web request, transform the URL according to conventions (where we expect the resource to be located in the general case) and fuzzy rules.
_Spoiler: this is where we will rely on wombat.js from webrecorder team, since this dynamic interception is quite complex and already done quite neatly by them_
### Fuzzy rules
The same fuzzy rules that have been used to compute the ZIM path from a resource URL have to be applied again when rewriting URLs.
While this is expected to serve mostly for the dynamic case, we still apply them on both sides (statically and dynamically) for consistency.
## Documents rewriten statically
For now warc2zim rewrites HTML, CSS and JS documents. For CSS and JS, this mainly consists in replacing URLs. For HTML, we also have more specific rewriting necessary (e.g. to handle base href or redirects with meta).
Since 2.1, no domain specific (DS) rules are applied like it is done in wabac.JS because these rules are already applied in Browsertrix Crawler. For the same reason, JSON is not rewritten anymore (URL do not need to be rewritten in JSON because these URLs will be used by JS, intercepted by wombat and dynamically rewritten).
JSONP callbacks are supposed to be rewritten but this has not been heavily tested.
Other types of documents are supposed to be either not feasible / not worth it (e.g. URLs inside PDF documents), meaningless (e.g. images, fonts) or planned for later due to limited usage in the wild (e.g. XML).

View file

@ -1,48 +0,0 @@
# Software architecture
## HTML rewriting
HTML rewriting is purely static (i.e. before resources are written to the ZIM). HTML code is parsed with the [HTML parser from Python standard library](https://docs.python.org/3/library/html.parser.html).
A small header script is inserted in HTML code to initialize wombat.js which will wrap all JS APIs to dynamically rewrite URLs coming from JS.
This header script is generated using [Jinja2](https://pypi.org/project/Jinja2/) template since it needs to populate some JS context variables needed by wombat.js operations (original scheme, original url, ...).
## CSS rewriting
CSS rewriting is purely static (i.e. before resources are written to the ZIM). CSS code is parsed with the [tinycss2 Python library](https://pypi.org/project/tinycss2/).
## JS rewriting
### Static
Static JS rewriting is simply a matter of pure textual manipulation with regular expressions. No parsing is done at all.
### Dynamic
Dynamic JS rewriting is done with [wombat JS library](https://github.com/webrecorder/wombat). The same fuzzy rules that are used for static rewritting are injected into wombat configuration. Code to rewrite URLs is an adapted version of the code used to compute ZIM paths.
For wombat setup, including the URL rewriting part, we need to pass wombat configuration info. This code is developed in the `javascript` folder. For URL parsing, it relies on the [uri-js library](https://www.npmjs.com/package/uri-js). This javascript code is bundled into a single `wombatSetup.js` file with [rollup bundler](https://rollupjs.org), the same bundler used by webrecorder team to bundle wombat.
## cdxj_indexer and warcio
[cdxj_indexer Python library](https://pypi.org/project/cdxj-indexer/) is a thin wrapper over [warcio Python library](https://pypi.org/project/warcio/). It is used to iterate over all records in WARCs.
It provides two main features:
- Loop over several WARCs in a directory (A visit of a website may be stored in several WARCs in the same directory).
- Provide buffered access to WARC content (and not a "stream" (fileio) only API), by monkey patching the returned WarcRecord.
Apart from that, the scraper directly uses WarcRecord (returned by cdxj_indexer, implemented in warcio) to access metadata and such.
## zimscraperlib
[zimscraperlib Python library](https://pypi.org/project/zimscraperlib) is used for ZIM operations.
## requests
[requests Python library](https://pypi.org/project/requests/) is used to retrieve the custom CSS file when a URL is passed.
## brotlipy
[brotlipy Python library](https://pypi.org/project/brotlipy/) is used to access brotli content in WARC records (not part of warcio because it is an optional dependency).

View file

@ -1,100 +0,0 @@
# Technical architecture
## Fuzzy rules
Fuzzy rules are stored in `rules/rules.yaml`. This configuration file is then used by `rules/generate_rules.py` to generate Python and JS code.
Should you update these fuzzy rules, you hence have to:
- regenerate Python and JS files by running `python rules/generate_rules.py`
- bundle again Javascript `wombatSetup.js` (see below).
## Wombat configuration
Wombat configuration contains some static configuration and the dynamic URL rewriting, including fuzzy rules.
It is bundled by rollup with `cd javascript && yarn build-prod` and the result is pushed to proper scraper location for inclusion at build time.
Tests are available and run with `cd javascript && yarn test`.
## Scraper operations
### High level overview
The scraper behavior is done in two phases.
First the WARC records are iterated to compute the ZIM metadata (find main path, favicon, ...) and detect which ZIM paths are expected to be populated. This is mandatory so that, when we rewrite the documents, we know whether the URLs we encounter lead to something which is internal (inside the ZIM) and should be rewritten, or external and should be kept as-is.
Second, the WARC records are iterated to be transformed and appended inside the ZIM. ZIM records are appended to the ZIM on the fly.
In both phases, WARC records are iterated in natural order, i.e. as they have been retrieved online during the crawl.
### Transformation of URL into ZIM path
Transforming a URL into a ZIM path has to respect the ZIM specification: path must not be url-encoded (i.e. it must be decoded) and it must be stored as UTF-8.
WARC record stores the items URL inside a header named "WARC-Target-URI". The value inside this header is encoded, or more exactly it is "exactly what the browser sent at the HTTP level" (see https://github.com/webrecorder/browsertrix-crawler/issues/492 for more details).
It has been decided (by convention) that we will drop the scheme, the port, the username and password from the URL. Headers are also not considered in this computation.
Computation of the ZIM path is hence mostly straightforward:
- decode the hostname which is puny-encoded
- decode the path and query parameter which might be url-encoded
## Rewriting documents
Some documents (HTML, CSS, JS and JSON for now) need to be rewritten, e.g. to rewrite URLs, adapt some code to the ZIM context, ...
The first important step when processing a WARC entry to add it as a ZIM entry is hence to properly detect which kind of document we are dealing with.
This is done in the `get_rewrite_mode` function of the `Rewriter` class. Before 2.0.1, scraper was relying only on mimetype as returned in `Content-Type` HTTP response.
Unfortunately, this caused problems where some servers are returning wrong information in this header, e.g. Cloudflare seems to frequently return `text/html` for woff2 fonts; this causes the scraper to fail, because it is impossible to know in advance that we should ignore these errors: we could have a real document which should be rewritten but is failing.
Since 2.0.1, we've enriched the logic by using the new WARC header `WARC-Resource-Type` which contains the type of resources "as perceived by the browser" (from https://chromedevtools.github.io/devtools-protocol/tot/Network/#type-ResourceType, see https://github.com/webrecorder/browsertrix-crawler/pull/481). Unfortunately this information is not sufficient because of some very generic values returned like `fetch` or `xhr`. The scraper still needs to mix this information with the mimetype. Ideally, we would have preferred to find a single source of truth not relying on something returned by the server, but it is not available for now (see https://github.com/openzim/warc2zim/issues/340 for a discussion on this topic).
### URL rewriting
In addition to the computation of the relative path from the current document URL to the URL to rewrite, URL rewriting also consists in computing the proper ZIM path (with same operation as above) and properly encoding it so that the resulting URL respects [RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986). Some important stuff has to be noted in this encoding.
- since the original hostname is now part of the path, it will now be url-encoded
- since the `?` and following query parameters are also part of the path (we do not want readers to drop them like kiwix-serve would do), they are also url-encoded
Below is an example case of the rewrite operation on an image URL found in an HTML document.
- Document original URL: `https://kiwix.org/a/article/document.html`
- Document ZIM path: `kiwix.org/a/article/document.html`
- Image original URL: `//xn--exmple-cva.com/a/resource/image.png?foo=bar`
- Image rewritten URL: `../../../ex%C3%A9mple.com/a/resource/image.png%3Ffoo%3Dbar`
- Image ZIM Path: `exémple.com/a/resource/image.png?foo=bar`
### JS Rewriting
JS Rewriting is a bit special because rules to apply are different whether we are using "classic" Javascript or "module" Javascript.
Detection of Javascript modules starts at the HTML level where we have a `<script type="module" src="...">` tag. This tells us that the file at src location is a Javascript module. From there we know that its subresources are also Javascript modules.
Currently this detection is done on-the-fly, based on the fact that WARC items are processed in the same order that they have been fetched by the browser, and we hence do not need a multi-pass approach. Meaning that HTML will be processed first, then parent JS, then its dependencies, ... **This is a strong assumption**.
### Different kinds of WARC records
The WARC to ZIM conversion is performed by transforming WARC records into ZIM records.
For `response` records, the rewritten payload (only, without HTTP headers) is stored inside the ZIM.
If the payload is zero-length, the record is omitted to conform to ZIM specifications of not storing empty records.
For `request` and `resource` records, they are simply ignored. These records do not convey important information for now.
**TODO** better explain what `request` and `resource` records are and why they might point to a different URL.
For `revisit` records, a ZIM alias is created if the revisit points to a different URL.
**TODO** better explain what `revisit` records are and why they might point to a different URL.
### Duplicate URIs
WARCs allow multiple records for the same URL, while ZIM does not. As a result, only the first encountered response or resource record is stored in the ZIM, and subsequent records are ignored.
For revisit records, they are only added as a ZIM alias if pointing to a different URL, and are processed after response records. A revisit record to the same URL will always be ignored.
All other WARC records are skipped.

View file

@ -1,2 +0,0 @@
src/fuzzyRules.js
test/fuzzyRules.js

View file

@ -1,3 +0,0 @@
{
"singleQuote": true
}

View file

@ -1,7 +0,0 @@
// ESLint configuration, exported as an array of configuration objects.
// A single configuration object is declared and it only customises rules.
export default [
{
rules: {
// `prefer-const` is reported at 'error' severity, so the `yarn eslint` check
// fails on violations (the rule targets `let` bindings never reassigned).
'prefer-const': 'error',
},
},
];

View file

@ -1,43 +0,0 @@
{
"name": "@openzim/wombat-setup",
"type": "module",
"version": "2.1.3-dev0",
"license": "GPL-3.0-or-later",
"author": "openZIM",
"devDependencies": {
"@rollup/plugin-commonjs": "26.0.1",
"@rollup/plugin-node-resolve": "15.2.3",
"@rollup/plugin-terser": "0.4.4",
"ava": "^6.1.3",
"eslint": "9.9.1",
"eslint-config-prettier": "9.1.0",
"prettier": "3.3.3",
"rollup": "4.21.2"
},
"scripts": {
"prettier-check": "prettier . --check",
"prettier-fix": "prettier . --write",
"eslint": "eslint .",
"test": "ava --verbose",
"build-prod": "rollup -c rollup.config.js",
"build-dev": "DEV=1 rollup -c rollup.config.js",
"build-dev-watch": "DEV=1 rollup --watch -c rollup.config.js"
},
"prettier": {
"singleQuote": true
},
"ava": {
"concurrency": 1,
"verbose": true,
"serial": true,
"files": [
"test/*.js"
],
"sources": [
"src/**/*"
]
},
"dependencies": {
"uri-js": "^4.4.1"
}
}

View file

@ -1,43 +0,0 @@
import path from 'path';
import url from 'url';
import { nodeResolve } from '@rollup/plugin-node-resolve'; // used to bundle node_modules code
import commonjs from '@rollup/plugin-commonjs'; // used to bundle CommonJS node_modules
import terser from '@rollup/plugin-terser'; // used to minify JS code
// Rollup configuration bundling `src/wombatSetup.js` into a single `wombatSetup.js`
// IIFE file.

// ES modules have no built-in `__filename` / `__dirname`; rebuild them from the
// module URL.
const __filename = url.fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Directory receiving the bundle: OUTPUT_DIR environment variable when set (and
// non-empty), otherwise the `src/warc2zim/statics` folder next to this folder.
const outputDir =
process.env.OUTPUT_DIR || path.join(__dirname, '../src/warc2zim/statics');
// Inline plugin removing the `'use strict';` directive from each rendered chunk.
// The pattern is a plain string, so only the first occurrence is removed.
const noStrict = {
renderChunk(code) {
return code.replace("'use strict';", '');
},
};
// Options handed to rollup's `watch` setting (used by the `--watch` build script):
// node_modules is excluded and the chokidar watcher is asked to poll.
const watchOptions = {
exclude: 'node_modules/**',
chokidar: {
alwaysStat: true,
usePolling: true,
},
};
// Plugins applied to every build: bundle node_modules code, convert CommonJS
// modules, then strip the strict-mode directive.
const plugins = [nodeResolve({ preferBuiltins: false }), commonjs(), noStrict];
// Minify with terser unless the DEV environment variable is set (and non-empty).
if (!process.env.DEV) {
plugins.push(terser());
}
export default {
input: 'src/wombatSetup.js',
output: {
// Name given to the IIFE bundle (`format: 'iife'` below)
name: 'wombatSetup',
file: path.join(outputDir, 'wombatSetup.js'),
sourcemap: false,
format: 'iife',
exports: 'named',
},
watch: watchOptions,
plugins: plugins,
};

View file

@ -1,313 +0,0 @@
import { fuzzyRules } from './fuzzyRules.js';
import URI from 'uri-js';
export function applyFuzzyRules(path) {
  // Simplify a ZIM path with the fuzzy rules. Rules are tried in declaration order;
  // the first rule whose substitution actually changes the path wins and its result
  // is returned right away. When no rule changes anything, the path is returned
  // untouched.
  for (const { match, replace } of fuzzyRules) {
    const pattern = new RegExp(match);
    const candidate = path.replace(pattern, replace);
    if (candidate != path) {
      return candidate;
    }
  }
  return path;
}
export function hasAlreadyBeenRewritten(
  original_absolute_url,
  orig_url,
  uri,
  url,
) {
  // Detect (with a heuristic) that `url` has most probably already been rewritten
  // and must be kept as-is.
  //
  // Parameters:
  // - original_absolute_url: `url` resolved against `orig_url`
  // - orig_url: original URL of the document being rewritten
  // - uri: parsed form of the URL (only `uri.scheme` is read)
  // - url: the URL to check, as a string
  //
  // Returns true when the URL looks already rewritten, false otherwise.
  //
  // All statically rewritten links are relative and contain a path including the
  // hostname. Such a path cannot be joined with orig_url: since it includes the
  // hostname, it goes one level too high in the path hierarchy (it only works on
  // ZIM paths / relative links).
  // The heuristic is:
  // - the link must be relative and start by going at least one level up
  // - the relative link, when merged with orig_url, is going exactly one "path level"
  //   too high in the hierarchy
  // - the first non relative part of the path (i.e. not `..`) looks like a hostname
  //   (i.e. it contains a dot)

  // Only scheme-less links going at least one level up are candidates
  if (!(typeof uri.scheme == 'undefined' && url.startsWith('../'))) {
    return false;
  }

  const urlParts = url.split('/');
  const resolvedWithOneLevelLess = URI.resolve(
    orig_url,
    urlParts.slice(1).join('/'),
  );
  const resolvedWithTwoLevelsLess = URI.resolve(
    orig_url,
    urlParts.slice(2).join('/'),
  );

  // Detect that the relative link is going exactly one "path level" too high:
  // dropping one `..` does not change the resolved URL, dropping two does.
  const goesExactlyOneLevelTooHigh =
    resolvedWithOneLevelLess == original_absolute_url &&
    resolvedWithTwoLevelsLess != original_absolute_url;
  if (!goesExactlyOneLevelTooHigh) {
    // we don't know and assume it can be safely rewritten
    return false;
  }

  // Detect that the first non relative part of the path looks like a hostname.
  // `find` returns undefined when every part is `..` (e.g. url is `../..`): there is
  // then no hostname-like part at all, so the URL has not been rewritten. Without
  // this guard, calling `indexOf` on undefined would throw a TypeError.
  const firstNonRelativePart = urlParts.find((urlPart) => urlPart !== '..');
  return (
    typeof firstNonRelativePart === 'string' &&
    firstNonRelativePart.indexOf('.') > -1
  );
}
function removeSubsequentSlashes(value) {
  // Collapse every run of two or more consecutive slashes into a single `/`
  // E.g `val//ue` or `val///ue` or `val////ue` (and so on) all become `val/ue`
  const repeatedSlashes = /\/\/+/g;
  return value.replace(repeatedSlashes, '/');
}
export function urlRewriteFunction(
  current_url, // The current (real) url we are on, e.g. http://library.kiwix.org/content/myzim_yyyy-mm/www.example.com/index.html
  orig_host, // The host of the original url, e.g. www.example.com
  orig_scheme, // The scheme of the original url, e.g. https
  orig_url, // The original url, e.g. https://www.example.com/index.html
  prefix, // The (absolute) prefix to add to all our urls (from where we are served), e.g. http://library.kiwix.org/content/myzim_yyyy-mm/
  url, // first argument passed by wombat.JS at each invocation, current url to rewrite, e.g. http://library.kiwix.org/content/myzim_yyyy-mm/www.example.com/image.png
  useRel,
  mod,
  doc, // last argument passed by wombat.JS at each invocation
) {
  // Rewrite `url` so that it points to the matching entry under `prefix`.
  //
  // The URL is returned untouched when it is falsy, when it is not really a URI,
  // when its scheme is neither http nor https, or when it looks already rewritten.
  // Otherwise the returned value is `prefix` followed by the url-encoded ZIM path
  // (decoded host + decoded path and query, simplified by the fuzzy rules).
  // `current_url`, `orig_host`, `orig_scheme`, `useRel`, `mod` and `doc` are only
  // used in the debug log.
  if (!url) return url;

  // Transform URL which might be an object (detected on Chromium browsers at least)
  url = String(url);

  // Special stuff which is not really a URI but exists in the wild
  if (['#', '{', '*'].includes(url.substring(0, 1))) return url;

  // If URI scheme is defined but not http or https, we have to not rewrite the URL
  const uri = URI.parse(url);
  if (
    typeof uri.scheme !== 'undefined' &&
    !['http', 'https'].includes(uri.scheme)
  )
    return url;

  // If url starts with prefix, we need to remove this prefix before applying usual
  // rewrite rules
  if (url.startsWith(prefix)) {
    url = uri.scheme + '://' + url.substring(prefix.length);
  }

  // This is a hack to detect improper URL encoding ; proper detection should be
  // possible with chardet or other alternatives but did not work so far ; we hence
  // take benefit of the error below to detect improper URL encoding
  // When improper URL encoding is detected, we try to encode URL as a best-effort;
  // 'best-effort', because if some part of the URL is encoded and another part is not,
  // this will fail ... but this is a weird edge case anyway
  try {
    decodeURIComponent(URI.parse(url).path);
  } catch (e) {
    url = encodeURI(url);
  }

  // Compute the absolute URI, just like the browser would have resolved it hopefully
  // We need to use the original URL for that to properly detect the hostname when
  // present ; current URL does not allow to do it easily
  const original_absolute_url = URI.resolve(orig_url, url);

  // Detect if url has probably already been rewritten and return as-is in such a case.
  // The heuristic lives only in hasAlreadyBeenRewritten: an inline copy of the very
  // same checks used to follow this call, but it could never return (the call above
  // it had already returned for every input it would have matched), so it has been
  // removed.
  if (hasAlreadyBeenRewritten(original_absolute_url, orig_url, uri, url)) {
    return url;
  }

  // We now have to transform this absolute URI into a normalized ZIM path entry
  const absolute_url_parts = URI.parse(original_absolute_url);

  // Let's first compute the decoded host
  const serialized_host = URI.serialize(
    URI.parse('http://' + absolute_url_parts.host), // fake URI to benefit from decoding
    { iri: true }, // decode potentially puny-encoded host
  );
  // Strip the 7 leading characters ('http://') and the last character
  const decoded_host = serialized_host.substring(7, serialized_host.length - 1);

  // And the decoded path, only exception is that an empty path must resolve to '/' path
  // (our convention, just like in Python)
  const decoded_path =
    !absolute_url_parts.path || absolute_url_parts.path.length === 0
      ? '/'
      : decodeURIComponent(absolute_url_parts.path);

  // And the decoded query, only exception is that + sign must resolve to ' ' to avoid
  // confusion (our convention, just like in Python)
  const decoded_query =
    !absolute_url_parts.query || absolute_url_parts.query.length === 0
      ? ''
      : '?' + decodeURIComponent(absolute_url_parts.query).replaceAll('+', ' ');

  // combine all decoded parts to get the ZIM path
  const zimPath =
    decoded_host + removeSubsequentSlashes(decoded_path + decoded_query);

  // apply the fuzzy rules to the ZIM path
  const fuzzifiedPath = applyFuzzyRules(zimPath);

  // Reencode everything but '/' (we decode it afterwards for simplicity)
  const finalUrl =
    prefix + encodeURIComponent(fuzzifiedPath).replaceAll('%2F', '/');

  console.debug(
    'urlRewriten:\n\t- current_url: ' +
      current_url +
      '\n\t- orig_host: ' +
      orig_host +
      '\n\t- orig_scheme: ' +
      orig_scheme +
      '\n\t- orig_url: ' +
      orig_url +
      '\n\t- prefix: ' +
      prefix +
      '\n\t- url: ' +
      url +
      '\n\t- useRel: ' +
      useRel +
      '\n\t- mod: ' +
      mod +
      '\n\t- doc: ' +
      doc +
      '\n\t- finalUrl: ' +
      finalUrl.toString() +
      '\n\t',
  );
  return finalUrl;
}
export function getWombatInfo(
current_url, // The current (real) url we are on
orig_host, // The host of the original url
orig_scheme, // The scheme of the original url
orig_url, // The original url
prefix, // The (absolute) prefix to add to all our urls (from where we are served))
) {
return {
// The rewrite function used to rewrite our urls.
rewrite_function: (url, useRel, mod, doc) =>
urlRewriteFunction(
current_url,
orig_host,
orig_scheme,
orig_url,
prefix,
url,
useRel,
mod,
doc,
),
// Seems to be used only to send message to. We don't care ?
top_url: current_url,
// Seems to be used to generate url for blobUrl returned by SW.
// We don't care (?)
url: orig_url,
// Use to timestamp message send to top frame. Don't care
timestamp: '',
// Use to send message to top frame and in default rewrite url function. Don't care
request_ts: '',
// The url on which we are served.
prefix: prefix,
// The default mod to use.
mod: '',
// Use to detect if we are framed (and send message to top frame ?)
is_framed: false,
// ??
is_live: false,
// Never used ?
coll: '',
// Set wombat if is proxy mode (we are not)
proxy_magic: '',
// This is the prefix on which we have stored our static files (needed by wombat).
// Must not conflict with other url served.
// Will be used by wombat to not rewrite back the url
static_prefix: prefix + '_zim_static/',
wombat_ts: '',
// A delay in sec to apply to all js time (`Date.now()`, ...)
wombat_sec: 0,
// The scheme of the original url
wombat_scheme: orig_scheme,
// The host of the original url
wombat_host: orig_host,
// We are not running inside a service worker, wombat needs to know about it since
// some "magic" URLs like blobs are not available
isSW: false,
// Convert all post request to get request
convert_post_to_get: true,
// Not used, we are not replaying in a frame
target_frame: '___wb_replay_top_frame',
// Not used, we are not running in live mode
enable_auto_fetch: false,
// Extra options, not used
wombat_opts: {},
};
}
// Default export bundling the public helpers of this module under their own names.
export default { applyFuzzyRules, urlRewriteFunction, getWombatInfo };

View file

@ -1,42 +0,0 @@
import test from 'ava';
import utils from '../src/wombatSetup.js';
// Fixture values shared by every test of this file.
test.beforeEach((t) => {
  Object.assign(t.context, {
    prefix: 'http://library.kiwix.org/content/myzim_yyyy-mm/',
    originalHost: 'www.example.com',
    originalScheme: 'https',
  });
});
// Checks every field of the object returned by getWombatInfo for a nominal page.
test('nominalWbInfo', (t) => {
  const { prefix, originalHost, originalScheme } = t.context;
  const path = 'path1/resource1.js';
  const currentUrl = prefix + path;
  const originalUrl = `${originalScheme}://${originalHost}/${path}`;

  const wmInfo = utils.getWombatInfo(
    currentUrl,
    originalHost,
    originalScheme,
    originalUrl,
    prefix,
  );

  t.is(wmInfo.coll, '');
  t.is(wmInfo.convert_post_to_get, true);
  t.is(wmInfo.enable_auto_fetch, false);
  t.is(wmInfo.isSW, false);
  t.is(wmInfo.is_framed, false);
  t.is(wmInfo.is_live, false);
  t.is(wmInfo.mod, '');
  t.is(wmInfo.prefix, prefix);
  t.is(wmInfo.proxy_magic, '');
  t.is(wmInfo.request_ts, '');
  t.is(wmInfo.static_prefix, prefix + '_zim_static/');
  t.is(wmInfo.target_frame, '___wb_replay_top_frame');
  t.is(wmInfo.timestamp, '');
  t.is(wmInfo.top_url, currentUrl);
  t.is(wmInfo.url, originalUrl);
  t.is(wmInfo.wombat_host, originalHost);
  t.deepEqual(wmInfo.wombat_opts, {});
  t.is(wmInfo.wombat_scheme, originalScheme);
  t.is(wmInfo.wombat_sec, 0);
  t.is(wmInfo.wombat_ts, '');
});

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,15 +0,0 @@
[files.assets.config]
target_dir="src/warc2zim/statics"
execute_after=[
"cd ../../../ && python rules/generate_rules.py", # generate Python (and JS) rules
]
[files.assets.actions."wombat.js"]
action="get_file"
source="https://cdn.jsdelivr.net/npm/@webrecorder/wombat@3.8.6/dist/wombat.js"
target_file="wombat.js"
[files.assets.actions."wombatSetup.js"] # fallback if this script has not been properly built (should happen only in dev)
action="get_file"
source="https://dev.kiwix.org/warc2zim/wombatSetup.js"
target_file="wombatSetup.js"

View file

@ -1,7 +1,5 @@
[build-system]
# jinja2 is required to generate JS and Python rules at build time
# PyYAML is used to parse fuzzy rules and generate Python/JS code
requires = ["hatchling", "hatch-openzim==0.2.1", "jinja2==3.1.4", "PyYAML==6.0.2"]
requires = ["hatchling", "hatch-openzim==0.2.1"]
build-backend = "hatchling.build"
[project]
@ -10,15 +8,15 @@ requires-python = ">=3.12,<3.13"
description = "Convert WARC to ZIM"
readme = "README.md"
dependencies = [
"warcio==1.7.4",
"warcio==1.7.5",
"requests==2.32.3",
"zimscraperlib==4.0.0",
"zimscraperlib==5.0.0rc2",
"jinja2==3.1.4", # also update version in build-system above and in build_js.sh
# to support possible brotli content in warcs, must be added separately
"brotlipy==0.7.0",
"cdxj_indexer==1.4.5",
"tinycss2==1.3.0",
"beautifulsoup4==4.12.3", # used to parse base href
"cdxj_indexer==1.4.6",
"tinycss2==1.4.0",
"beautifulsoup4==4.12.3", # used to parse base href
"lxml==5.3.0", # used by beautifulsoup4 for parsing html
"python-dateutil==2.9.0.post0",
]
@ -32,27 +30,24 @@ additional-keywords = ["warc"]
name="Webrecorder Software"
email="info@webrecorder.net"
[tool.hatch.build.hooks.openzim-build]
[project.optional-dependencies]
scripts = [
"invoke==2.2.0",
"PyYAML==6.0.2", # used to parse fuzzy rules and generate Python/JS code ; also update version in build-system above and in build_js.sh
]
lint = [
"black==24.10.0",
"ruff==0.6.9",
"ruff==0.8.4",
]
check = [
"pyright==1.1.383",
"pyright==1.1.391",
]
test = [
"pytest==8.3.3",
"coverage==7.6.1",
"pytest==8.3.4",
"coverage==7.6.9",
]
dev = [
"pre-commit==4.0.0",
"debugpy==1.8.6",
"pre-commit==4.0.1",
"debugpy==1.8.11",
"warc2zim[scripts]",
"warc2zim[lint]",
"warc2zim[test]",
@ -72,10 +67,6 @@ exclude = [
[tool.hatch.build.targets.wheel]
packages = ["src/warc2zim"]
artifacts = [
"src/warc2zim/statics/**",
"src/warc2zim/rules.py",
]
[tool.hatch.envs.default]
features = ["dev"]
@ -209,7 +200,6 @@ ban-relative-imports = "all"
[tool.ruff.lint.per-file-ignores]
# Tests can use magic values, assertions, and relative imports
"tests/**/*" = ["PLR2004", "S101", "TID252"]
"rules/generate_rules.py" = ["T201"]
[tool.pytest.ini_options]
minversion = "7.3"

View file

@ -1,174 +0,0 @@
import re
import sys
from pathlib import Path
import yaml
from jinja2 import Environment
# The rule definitions are expected in a `rules.yaml` file sitting next to this script
rules_src = Path(__file__).parent / "rules.yaml"

if not rules_src.exists():
    # Nothing to generate from; this mostly happens in CI operations which install
    # only the Python dependencies
    print("Skipping rules generation, rule file is missing")
    sys.exit()

FUZZY_RULES = yaml.safe_load(rules_src.read_text())["fuzzyRules"]

# Every rule must carry a name and at least one test case, abort otherwise
for rule in FUZZY_RULES:
    if "name" not in rule:
        sys.exit("Fuzzy rule is missing a name")
    has_tests = "tests" in rule and len(rule["tests"]) != 0
    if not has_tests:
        sys.exit("Fuzzy rule is missing test cases")

# Matches a backslash followed by one digit, i.e. a Python-style group reference
PY2JS_RULE_RX = re.compile(r"\\(\d)", re.ASCII)

# Autoescaping stays off: templates produce source code which must be emitted as-is,
# it is never interpreted as HTML
JINJA_ENV = Environment(autoescape=False)  # noqa: S701
### Generate Javascript code
# Jinja template of the generated JS module: it exports a `fuzzyRules` array holding
# one `{ match, replace }` object literal per fuzzy rule.
js_code_template = """// THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY
export const fuzzyRules = [
{% for rule in FUZZY_RULES %} {
match: '{{ rule['match'] }}',
replace: '{{ rule['replace'] }}',
},
{% endfor %}
];
"""
# Target folder is located relative to this script's own path
js_parent = Path(__file__).joinpath("../../javascript/src").resolve()
if not js_parent.exists():
# This skip is useful mostly for CI operations when working on the Python part
print("Skipping JS rules generation, target folder is missing")
else:
(js_parent / "fuzzyRules.js").write_text(
JINJA_ENV.from_string(js_code_template).render(
FUZZY_RULES=[
{
# Backslashes are doubled because the pattern is emitted inside a
# single-quoted JS string literal by the template
"match": rule["pattern"].replace("\\", "\\\\"),
# Group references are converted from `\N` to `$N`
"replace": PY2JS_RULE_RX.sub(r"$\1", rule["replace"]),
}
for rule in FUZZY_RULES
]
)
)
print("JS rules generation completed successfully")
### Generate Javascript tests
# Jinja template of the generated ava test file: one test per test case of every
# rule, named `fuzzyrules_<rule name>_<case index>`. A case flagged `unchanged` expects
# applyFuzzyRules to return the raw URL, other cases expect `fuzzified_url`.
js_test_template = """// THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY
import test from 'ava';
import { applyFuzzyRules } from '../src/wombatSetup.js';
{% for rule in FUZZY_RULES %}
{% for test in rule['tests'] %}
test('fuzzyrules_{{rule['name']}}_{{loop.index}}', (t) => {
t.is(
applyFuzzyRules(
'{{test['raw_url']}}',
),
'{{test['raw_url'] if test['unchanged'] else test['fuzzified_url']}}',
);
});
{% endfor %}
{% endfor %}
"""
# Target folder is located relative to this script's own path
js_parent = Path(__file__).joinpath("../../javascript/test").resolve()
if not js_parent.exists():
# This skip is useful mostly for CI operations when working on the Python part
print("Skipping JS tests generation, target folder is missing")
else:
(js_parent / "fuzzyRules.js").write_text(
JINJA_ENV.from_string(js_test_template).render(
FUZZY_RULES=[
{
"name": rule["name"],
"tests": rule["tests"],
# NOTE(review): the template above only reads `name` and `tests`;
# `match` and `replace` look unused here - confirm before removing
"match": rule["pattern"].replace("\\", "\\\\"),
"replace": PY2JS_RULE_RX.sub(r"$\1", rule["replace"]),
}
for rule in FUZZY_RULES
]
)
)
print("JS tests generation completed successfully")
### Generate Python code
# Jinja template of the generated Python module: it defines a `FUZZY_RULES` list
# holding one dict per rule, pattern and replacement being emitted as raw strings.
py_code_template = """# THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY
FUZZY_RULES = [
{% for rule in FUZZY_RULES %} {
"pattern": r"{{ rule['pattern'] }}",
"replace": r"{{ rule['replace'] }}",
},
{% endfor %}
]
"""
# Target folder is located relative to this script's own path
py_parent = Path(__file__).joinpath("../../src/warc2zim").resolve()
if not py_parent.exists():
# This skip is useful mostly for CI operations when working on the JS part
print("Skipping Python rules generation, target folder is missing")
else:
# Rules are rendered unmodified (no JS-specific conversion is applied here)
(py_parent / "rules.py").absolute().write_text(
JINJA_ENV.from_string(py_code_template).render(FUZZY_RULES=FUZZY_RULES)
)
print("Python rules generation completed successfully")
### Generate Python tests
# Jinja template of the generated pytest module: for every rule, one parametrized
# fixture `<rule name>_case` listing its test cases and one `test_fuzzyrules_<rule name>`
# test comparing apply_fuzzy_rules output with the expected string. Cases flagged
# `unchanged` build ContentForTests with the raw URL only.
py_test_template = """# THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY
import pytest
from warc2zim.url_rewriting import apply_fuzzy_rules
from .utils import ContentForTests
{% for rule in FUZZY_RULES %}
@pytest.fixture(
params=[
{% for test in rule['tests'] %}
{% if test['unchanged'] %}
ContentForTests(
"{{ test['raw_url'] }}",
),
{% else %}
ContentForTests(
"{{ test['raw_url'] }}",
"{{ test['fuzzified_url'] }}",
),
{% endif %}
{% endfor %}
]
)
def {{ rule['name'] }}_case(request):
yield request.param
def test_fuzzyrules_{{ rule['name'] }}({{ rule['name'] }}_case):
assert (
apply_fuzzy_rules({{ rule['name'] }}_case.input_str)
== {{ rule['name'] }}_case.expected_str
)
{% endfor %}
"""
# Target folder is located relative to this script's own path
py_parent = Path(__file__).joinpath("../../tests").resolve()
if not py_parent.exists():
# This skip is useful mostly for CI operations when working on the JS part
print("Skipping Python tests generation, target folder is missing")
else:
(py_parent / "test_fuzzy_rules.py").absolute().write_text(
JINJA_ENV.from_string(py_test_template).render(FUZZY_RULES=FUZZY_RULES)
)
print("Python tests generation completed successfully")

View file

@ -1,213 +0,0 @@
# This file comes from an adaptation of rules present in
# https://github.com/webrecorder/wabac.js/blame/main/src/fuzzymatcher.js
#
# Syncing rules is done manually, based on expert knowledge, especially because in
# warc2zim we are not really fuzzy matching (searching the best entry among existing
# ones) but just rewriting to proper path.
#
# This file is in sync with content at commit 879018d5b96962df82340a9a57570bbc0fc67815
# from June 9, 2024
#
# This file should be updated at every release of warc2zim
#
# Some rules are intentionally missing because they have not been tested in warc2zim
# yet: Twitter, Washington Post, WixStatic, Facebook
#
# Generic rules are also omitted on purpose, we don't need them
#
fuzzyRules:
- name: googlevideo_com
pattern: .*googlevideo.com/(videoplayback(?=\?)).*[?&](id=[^&]+).*
replace: youtube.fuzzy.replayweb.page/\1?\2
tests:
- raw_url: foobargooglevideo.com/videoplayback?id=1576&key=value
fuzzified_url: youtube.fuzzy.replayweb.page/videoplayback?id=1576
- raw_url: foobargooglevideo.com/videoplayback?some=thing&id=1576
fuzzified_url: youtube.fuzzy.replayweb.page/videoplayback?id=1576
- raw_url: foobargooglevideo.com/videoplayback?some=thing&id=1576&key=value
fuzzified_url: youtube.fuzzy.replayweb.page/videoplayback?id=1576
- raw_url: foobargooglevideo.com/videoplaybackandfoo?some=thing&id=1576&key=value
unchanged: true # videoplayback is not followed by `?`
- raw_url: foobargoogle_video.com/videoplaybackandfoo?some=thing&id=1576&key=value
unchanged: true # No googlevideo.com in url
- name: youtube_video_info
pattern: (?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?).*(video_id=[^&]+).*
replace : youtube.fuzzy.replayweb.page/\1\2
tests:
- raw_url: www.youtube.com/get_video_info?video_id=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
- raw_url: www.youtube.com/get_video_info?foo=bar&video_id=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
- raw_url: www.youtube.com/get_video_info?video_id=123ah&foo=bar
fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
- raw_url: youtube.com/get_video_info?video_id=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
- raw_url: youtube-nocookie.com/get_video_info?video_id=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
- raw_url: www.youtube-nocookie.com/get_video_info?video_id=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
- raw_url: www.youtube-nocookie.com/get_video_info?foo=bar
unchanged: true # no video_id parameter
- raw_url: www.youtubeqnocookie.com/get_video_info?video_id=123ah
unchanged: true # improper hostname
- name: youtube_thumbnails
pattern: i\.ytimg\.com\/vi\/(.*?)\/.*?\.(\w*?)(?:\?.*|$)
replace : i.ytimg.com.fuzzy.replayweb.page/vi/\1/thumbnail.\2
tests:
- raw_url: i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.jpg?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGHIgTyg-MA8=&rs=AOn4CLDr-FmDmP3aCsD84l48ygBmkwHg-g
fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg
- raw_url: i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.png?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGHIgTyg-MA8=&rs=AOn4CLDr-FmDmP3aCsD84l48ygBmkwHg-g
fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.png
- raw_url: i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.jpg
fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg
- raw_url: i.ytimg.com/vi/-KpLmsAR23I/max-res.default.jpg
fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg
- name: trim_digits_only
pattern: ([^?]+)\?[\d]+$
replace : \1
tests:
- raw_url: www.example.com/page?1234
fuzzified_url: www.example.com/page
- raw_url: www.example.com/page?foo=1234
unchanged: true
- raw_url: www.example.com/page1234
unchanged: true
- raw_url: www.example.com/page?foo=bar&1234
unchanged: true
- raw_url: www.example.com/page?1234=bar
unchanged: true
- raw_url: www.example.com/page?1234&foo=bar
unchanged: true
- name: youtubei
pattern: (?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*
replace : youtube.fuzzy.replayweb.page/\1?\2
tests:
- raw_url: www.youtube-nocookie.com/youtubei/page/?videoId=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
- raw_url: youtube-nocookie.com/youtubei/page/?videoId=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
- raw_url: youtube.com/youtubei/page/?videoId=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
- raw_url: www.youtube.com/youtubei/page/?videoId=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
- raw_url: youtube.com/youtubei/page/videoId=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
- raw_url: youtube.com/youtubei/page/videoIdqqq=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoIdqqq=123ah
- raw_url: youtube.com/youtubei/page/videoId=123ah&foo=bar
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
- raw_url: youtube.com/youtubei/page/?foo=bar&videoId=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
- raw_url: youtube.com/youtubei/page/foo=bar&videoId=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/foo=bar&?videoId=123ah
- raw_url: youtube.com/youtubei/?videoId=123ah
unchanged: true
- name: youtube_embed
pattern: (?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*
replace : youtube.fuzzy.replayweb.page/embed/\1
tests:
- raw_url: www.youtube-nocookie.com/embed/foo
fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo
- raw_url: www.youtube-nocookie.com/embed/bar
fuzzified_url: youtube.fuzzy.replayweb.page/embed/bar
- raw_url: www.youtube-nocookie.com/embed/foo/bar
fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo/bar
- raw_url: www.youtube.com/embed/foo
fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo
- raw_url: youtube.com/embed/foo
fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo
- raw_url: youtube-nocookie.com/embed/foo
fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo
- raw_url: youtube.com/embed/foo?bar=alice
fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo
- name: vimeo_cdn_fix # custom warc2zim rule intended to fix Vimeo support
pattern: .*(?:gcs-vimeo|vod|vod-progressive|vod-adaptive)\.akamaized\.net.*\/(.+?.mp4)\?.*range=(.*?)(?:&.*|$)
replace : vimeo-cdn.fuzzy.replayweb.page/\1?range=\2
tests:
- raw_url: gcs-vimeo.akamaized.net/123.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: vod.akamaized.net/123.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: vod-progressive.akamaized.net/123.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: vod-adaptive.akamaized.net/123.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: vod.akamaized.net/123.mp4?foo=bar&range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: vod.akamaized.net/123.mp4?foo=bar&range=123-456&bar=foo
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: vod.akamaized.net/123.mp4?range=123-456&bar=foo
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: foovod.akamaized.net/123.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: vod.akamaized.net/1/23.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456
- raw_url: vod.akamaized.net/a/23.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456
- raw_url: vod.akamaized.net/foo/bar/23.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456
- raw_url: foo.akamaized.net/123.mp4?range=123-456
unchanged: true
- name: vimeo_cdn
pattern: .*(?:gcs-vimeo|vod|vod-progressive)\.akamaized\.net.*?\/([\d/]+.mp4)$
replace : vimeo-cdn.fuzzy.replayweb.page/\1
tests:
- raw_url: vod.akamaized.net/23.mp4
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4
- raw_url: vod.akamaized.net/23/12332.mp4
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23/12332.mp4
- raw_url: https://vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4
- name: vimeo_player
pattern: .*player.vimeo.com\/(video\/[\d]+)\?.*
replace : vimeo.fuzzy.replayweb.page/\1
tests:
- raw_url: player.vimeo.com/video/1234?foo=bar
fuzzified_url: vimeo.fuzzy.replayweb.page/video/1234
- raw_url: foo.player.vimeo.com/video/1234?foo=bar
fuzzified_url: vimeo.fuzzy.replayweb.page/video/1234
- raw_url: player.vimeo.com/video/1234?foo
fuzzified_url: vimeo.fuzzy.replayweb.page/video/1234
- raw_url: player.vimeo.com/video/1/23?foo=bar
unchanged: true
- raw_url: player.vimeo.com/video/123a?foo=bar
unchanged: true
- raw_url: player.vimeo.com/video/?foo=bar
unchanged: true
- name: i_vimeo_cdn
pattern: .*i\.vimeocdn\.com\/(.*)\?.*
replace : i.vimeocdn.fuzzy.replayweb.page/\1
tests:
- raw_url: i.vimeocdn.com/image/1234?foo=bar
fuzzified_url: i.vimeocdn.fuzzy.replayweb.page/image/1234
- raw_url: i.vimeocdn.com/something/a456?foo
fuzzified_url: i.vimeocdn.fuzzy.replayweb.page/something/a456
- name: cheatography_com
pattern: cheatography\.com\/scripts\/(.*).js.*[?&](v=[^&]+).*
replace : cheatography.com.fuzzy.replayweb.page/scripts/\1.js?\2
tests:
- raw_url: cheatography.com/scripts/useful.min.js?v=2&q=1719438924
fuzzified_url: cheatography.com.fuzzy.replayweb.page/scripts/useful.min.js?v=2
- raw_url: cheatography.com/scripts/foo.js?v=2&q=1719438924
fuzzified_url: cheatography.com.fuzzy.replayweb.page/scripts/foo.js?v=2
- raw_url: cheatography.com/scripts/useful.min.js?q=1719438924&v=2
fuzzified_url: cheatography.com.fuzzy.replayweb.page/scripts/useful.min.js?v=2
- raw_url: cheatography.com/scripts/useful.min.js?q=1719438924&v=2&foo=bar
fuzzified_url: cheatography.com.fuzzy.replayweb.page/scripts/useful.min.js?v=2
- name: der_postillon_com
pattern: blogger.googleusercontent.com\/img\/(.*\.jpg)=.*
replace: blogger.googleusercontent.com.fuzzy.replayweb.page/img/\1.resized
tests:
- raw_url: blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjlN4LY6kFVwL8-rinDWp3kJp1TowOVD8vq8TP8nl3Lf1sI-hx0DE1GQA1jw7DT7XvK3FjghzJ17_1pvyXyDBAV0vtigJRnFCNfMxnndBnN3NYoXUvKQQsQ7JTGXOSajdo0mNQIv8wss_AxPBMrR4-Dd_EEacV7ZMS3m_IL2dz0WsbbKn7FD7ntsfOe0JUq/s600-rw/tickerzugtier2.jpg=w487-h220-p-k-no-nu
fuzzified_url: blogger.googleusercontent.com.fuzzy.replayweb.page/img/b/R29vZ2xl/AVvXsEjlN4LY6kFVwL8-rinDWp3kJp1TowOVD8vq8TP8nl3Lf1sI-hx0DE1GQA1jw7DT7XvK3FjghzJ17_1pvyXyDBAV0vtigJRnFCNfMxnndBnN3NYoXUvKQQsQ7JTGXOSajdo0mNQIv8wss_AxPBMrR4-Dd_EEacV7ZMS3m_IL2dz0WsbbKn7FD7ntsfOe0JUq/s600-rw/tickerzugtier2.jpg.resized
- raw_url: blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjlN4LY6kFVwL8-rinDWp3kJp1TowOVD8vq8TP8nl3Lf1sI-hx0DE1GQA1jw7DT7XvK3FjghzJ17_1pvyXyDBAV0vtigJRnFCNfMxnndBnN3NYoXUvKQQsQ7JTGXOSajdo0mNQIv8wss_AxPBMrR4-Dd_EEacV7ZMS3m_IL2dz0WsbbKn7FD7ntsfOe0JUq/w72-h72-p-k-no-nu/tickerzugtier2.jpg
unchanged: true
- name: iranwire_com
pattern: (iranwire\.com\/questions\/detail\/.*)\?.*
replace: \1
tests:
- raw_url: iranwire.com/questions/detail/1723?&_=1721804954220
fuzzified_url: iranwire.com/questions/detail/1723
- raw_url: iranwire.com/questions/detail/1725?foo=bar&_=1721804454220
fuzzified_url: iranwire.com/questions/detail/1725

View file

@ -1,115 +0,0 @@
import re
from collections.abc import Iterable
from tinycss2 import (
ast,
parse_declaration_list,
parse_stylesheet,
parse_stylesheet_bytes,
serialize,
)
from tinycss2.serializer import serialize_url
from warc2zim.constants import logger
from warc2zim.content_rewriting.rx_replacer import RxRewriter
from warc2zim.url_rewriting import ArticleUrlRewriter
class FallbackRegexCssRewriter(RxRewriter):
    """Regex-based rewriter of CSS `url(...)` references

    Used as a fallback when regular CSS parsing fails. It registers a single rule
    which finds `url(...)` occurrences, quoted or not, and passes the URL through
    the URL rewriter while keeping the original quoting.
    """

    def __init__(self, url_rewriter: ArticleUrlRewriter, base_href: str | None):
        """Build the single `url(...)` rewriting rule

        Args:
            url_rewriter: callable rewriting one URL, called with the URL and the
              base href
            base_href: value of the document base href, if any
        """
        rules = [
            (
                # The quote is optional *inside* its group (`['"]?`) so that the
                # group always participates in the match. With an optional group
                # (`(...)?`), the `(?P=quote)` backreference can never match when
                # the URL is unquoted, and unquoted URLs would be left untouched.
                re.compile(r"""url\((?P<quote>['"]?)(?P<url>.+?)(?P=quote)(?<!\\)\)"""),
                lambda m_object, _opts: "".join(
                    [
                        "url(",
                        m_object["quote"],
                        url_rewriter(m_object["url"], base_href),
                        m_object["quote"],
                        ")",
                    ]
                ),
            )
        ]
        super().__init__(rules)
class CssRewriter:
    """Rewrite URLs found in CSS, either full stylesheets or inline declarations

    CSS is parsed with tinycss2 and every URL found in the token tree is passed
    through the URL rewriter. Should anything fail, the regex-based fallback
    rewriter is applied on the raw content instead.
    """

    def __init__(self, url_rewriter: ArticleUrlRewriter, base_href: str | None):
        self.url_rewriter = url_rewriter
        self.base_href = base_href
        self.fallback_rewriter = FallbackRegexCssRewriter(url_rewriter, base_href)

    def rewrite(self, content: str | bytes) -> str:
        """Rewrite a whole stylesheet, passed either as text or as bytes"""
        try:
            rules = (
                parse_stylesheet_bytes(content)[0]
                if isinstance(content, bytes)
                else parse_stylesheet(content)
            )
            self.process_list(rules)
            return serialize(rules)
        except Exception:
            # When tinycss2 cannot parse the CSS it produces an "Error" token and
            # the exception is only raised when serializing. The whole process is
            # guarded anyway, to be on the safe side.
            logger.warning(
                (
                    "Css transformation fails. Fallback to regex rewriter.\n"
                    "Article path is %s"
                ),
                self.url_rewriter.article_url,
            )
            return self.fallback_rewriter.rewrite(content, {})

    def rewrite_inline(self, content: str) -> str:
        """Rewrite a list of declarations (e.g. the value of a style attribute)"""
        try:
            declarations = parse_declaration_list(content)
            self.process_list(declarations)
            return serialize(declarations)
        except Exception:
            # When tinycss2 cannot parse the CSS it produces an "Error" token and
            # the exception is only raised when serializing. The whole process is
            # guarded anyway, to be on the safe side.
            logger.warning(
                (
                    "Css transformation fails. Fallback to regex rewriter.\n"
                    "Content is `%s`"
                ),
                content,
            )
            return self.fallback_rewriter.rewrite(content, {})

    def process_list(self, components: Iterable[ast.Node]):
        """Process every component of a list, which may be null or empty"""
        for component in components or ():
            self.process(component)

    def process(self, component: ast.Node):
        """Rewrite URLs of one component, recursing into nested components"""
        if isinstance(
            component,
            ast.QualifiedRule
            | ast.SquareBracketsBlock
            | ast.ParenthesesBlock
            | ast.CurlyBracketsBlock,
        ):
            self.process_list(component.content)
            return

        if isinstance(component, ast.FunctionBlock):
            if component.lower_name != "url":
                self.process_list(component.arguments)
                return
            # url("...") function: the URL is carried by the first argument
            url_component = component.arguments[0]
            rewritten_url = self.url_rewriter(url_component.value, self.base_href)
            url_component.value = rewritten_url
            url_component.representation = f'"{serialize_url(rewritten_url)}"'
            return

        if isinstance(component, ast.AtRule):
            self.process_list(component.prelude)
            self.process_list(component.content)
            return

        if isinstance(component, ast.Declaration):
            self.process_list(component.value)
            return

        if isinstance(component, ast.URLToken):
            # url(...) token: the URL is carried by the token itself
            rewritten_url = self.url_rewriter(component.value, self.base_href)
            component.value = rewritten_url
            component.representation = f"url({serialize_url(rewritten_url)})"

View file

@ -1,665 +0,0 @@
import io
import re
from collections import namedtuple
from collections.abc import Callable
from dataclasses import dataclass
from functools import cache
from html import escape
from html.parser import HTMLParser
from inspect import Signature, signature
from bs4 import BeautifulSoup
from warc2zim.content_rewriting.css import CssRewriter
from warc2zim.content_rewriting.js import JsRewriter
from warc2zim.url_rewriting import ArticleUrlRewriter, ZimPath
AttrNameAndValue = tuple[str, str | None]
AttrsList = list[AttrNameAndValue]
RewritenHtml = namedtuple("RewritenHmtl", ["title", "content"])
HTTP_EQUIV_REDIRECT_RE = re.compile(
r"^\s*(?P<interval>.*?)\s*;\s*url\s*=\s*(?P<url>.*?)\s*$"
)
def get_attr_value_from(
    attrs: AttrsList, name: str, default: str | None = None
) -> str | None:
    """Return the value of the first attribute named `name`, or `default` if absent"""
    return next(
        (attr_value for attr_name, attr_value in attrs if attr_name == name),
        default,
    )
def format_attr(name: str, value: str | None) -> str:
"""Format a given attribute name and value, properly escaping the value"""
if value is None:
return name
html_escaped_value = escape(value, quote=True)
return f'{name}="{html_escaped_value}"'
def get_html_rewrite_context(tag: str, attrs: AttrsList) -> str:
    """Compute the rewrite context of an HTML tag

    The context is the tag name, except where rewriting depends on another
    attribute: script tags are qualified by their type attribute, and link tags
    preloading scripts or modules are mapped to the matching JS context.
    """
    if tag == "link":
        link_rel = get_attr_value_from(attrs, "rel")
        if link_rel == "modulepreload":
            return "js-module"
        if link_rel == "preload" and get_attr_value_from(attrs, "as") == "script":
            return "js-classic"
        return tag

    if tag != "script":
        return tag

    # A missing or empty type attribute means classic JS, an unlisted one is unknown
    script_type = get_attr_value_from(attrs, "type") or ""
    context_by_type = {
        "application/json": "json",
        "json": "json",
        "module": "js-module",
        "application/javascript": "js-classic",
        "text/javascript": "js-classic",
        "": "js-classic",
    }
    return context_by_type.get(script_type, "unknown")
def extract_base_href(content: str) -> str | None:
    """Return the href of the first <base> tag of the HTML head carrying one

    This runs on its own, before the real parsing / rewriting of the document,
    since the value is needed to rewrite any link, including those located before
    the <base> tag in the head (e.g. <link> for favicons).
    """
    head = BeautifulSoup(content, features="lxml").head
    if not head:
        return None
    return next(
        (base["href"] for base in head.find_all("base") if base.has_attr("href")),
        None,
    )
@cache
def _cached_signature(func: Callable) -> Signature:
"""Returns the signature of a given callable
Result is cached to save performance when reused multiple times
"""
return signature(func)
class HtmlRewriter(HTMLParser):
def __init__(
self,
url_rewriter: ArticleUrlRewriter,
pre_head_insert: str,
post_head_insert: str | None,
notify_js_module: Callable[[ZimPath], None],
):
super().__init__(convert_charrefs=False)
self.url_rewriter = url_rewriter
self.title = None
self.output = None
# This works only for tag without children.
# But as we use it to get the title, we are ok
self.html_rewrite_context = None
self.pre_head_insert = pre_head_insert
self.post_head_insert = post_head_insert
self.notify_js_module = notify_js_module
def rewrite(self, content: str) -> RewritenHtml:
if self.output is not None:
raise Exception("ouput should not already be set") # pragma: no cover
self.output = io.StringIO()
self.base_href = extract_base_href(content)
self.css_rewriter = CssRewriter(self.url_rewriter, self.base_href)
self.js_rewriter = JsRewriter(
url_rewriter=self.url_rewriter,
base_href=self.base_href,
notify_js_module=self.notify_js_module,
)
self.feed(content)
self.close()
output = self.output.getvalue()
self.output = None
return RewritenHtml(self.title or "", output)
def send(self, value: str):
self.output.write(value) # pyright: ignore[reportOptionalMemberAccess]
def handle_starttag(self, tag: str, attrs: AttrsList, *, auto_close: bool = False):
self.html_rewrite_context = get_html_rewrite_context(tag=tag, attrs=attrs)
if (
rewritten := rules._do_tag_rewrite(
tag=tag, attrs=attrs, auto_close=auto_close
)
) is not None:
self.send(rewritten)
return
self.send(f"<{tag}")
if attrs:
self.send(" ")
self.send(
" ".join(
format_attr(*attr)
for attr in (
rules._do_attribute_rewrite(
tag=tag,
attr_name=attr_name,
attr_value=attr_value,
attrs=attrs,
js_rewriter=self.js_rewriter,
css_rewriter=self.css_rewriter,
url_rewriter=self.url_rewriter,
base_href=self.base_href,
notify_js_module=self.notify_js_module,
)
for attr_name, attr_value in attrs
if not rules._do_drop_attribute(
tag=tag, attr_name=attr_name, attr_value=attr_value, attrs=attrs
)
)
)
)
if auto_close:
self.send(" />")
else:
self.send(">")
if tag == "head" and self.pre_head_insert:
self.send(self.pre_head_insert)
def handle_endtag(self, tag: str):
self.html_rewrite_context = None
if tag == "head" and self.post_head_insert:
self.send(self.post_head_insert)
self.send(f"</{tag}>")
def handle_startendtag(self, tag: str, attrs: AttrsList):
self.handle_starttag(tag, attrs, auto_close=True)
self.html_rewrite_context = None
def handle_data(self, data: str):
if self.html_rewrite_context == "title" and self.title is None:
self.title = data.strip()
if (
data.strip()
and (
rewritten := rules._do_data_rewrite(
html_rewrite_context=self.html_rewrite_context,
data=data,
css_rewriter=self.css_rewriter,
js_rewriter=self.js_rewriter,
url_rewriter=self.url_rewriter,
)
)
is not None
):
self.send(rewritten)
return
self.send(data)
def handle_entityref(self, name: str):
self.send(f"&{name};")
def handle_charref(self, name: str):
    """Emit a numeric character reference (`&#name;`) unchanged."""
    self.send("&#" + name + ";")
def handle_comment(self, data: str):
    """Emit an HTML comment with its content unchanged."""
    self.send("<!--" + data + "-->")
def handle_decl(self, decl: str):
    """Emit a declaration (e.g. a doctype) as `<!decl>`, content unchanged."""
    self.send("<!" + decl + ">")
def handle_pi(self, data: str):
    """Emit a processing instruction as `<?data>`, content unchanged."""
    self.send("<?" + data + ">")
def unknown_decl(self, data: str):
    """Emit a marked section (`<![...]]>` or `<![...]>`) unchanged.

    The standard library parser calls this hook with the text found between the
    opening `<![` and the section terminator, e.g. `CDATA[x` for `<![CDATA[x]]>`
    and `if !IE` for `<![if !IE]>`. Emitting it like a plain declaration
    (`<!CDATA[x>`) would lose the opening bracket and the terminator, corrupting
    CDATA sections, so the original delimiters are restored here.
    """
    # Microsoft conditional sections end with `]>`, all other ones with `]]>`
    is_ms_conditional = data.lower().startswith(("if", "else", "endif"))
    terminator = "]>" if is_ms_conditional else "]]>"
    self.send(f"<![{data}{terminator}")
# Types of the callbacks which can be registered as rewriting rules. Parameters are
# left open (`...`) because a rule only declares the arguments it needs.
# Returns whether the attribute has to be dropped
DropAttributeCallable = Callable[..., bool]
# Returns the rewritten attribute name and value, or None when the rule does not apply
RewriteAttributeCallable = Callable[..., AttrNameAndValue | None]
# Returns the rewritten start tag, or None when the rule does not apply
RewriteTagCallable = Callable[..., str | None]
# Returns the rewritten data, or None when the rule does not apply
RewriteDataCallable = Callable[..., str | None]
@dataclass(frozen=True)
class DropAttributeRule:
    """Immutable holder of one callback deciding if an HTML attribute is dropped"""

    # callback of the rule, called with the subset of arguments it declares
    func: DropAttributeCallable
@dataclass(frozen=True)
class RewriteAttributeRule:
    """Immutable holder of one callback rewriting a given HTML attribute"""

    # callback of the rule, called with the subset of arguments it declares
    func: RewriteAttributeCallable
@dataclass(frozen=True)
class RewriteTagRule:
    """Immutable holder of one callback rewriting a whole HTML start tag"""

    # callback of the rule, called with the subset of arguments it declares
    func: RewriteTagCallable
@dataclass(frozen=True)
class RewriteDataRule:
    """Immutable holder of one callback rewriting the data found inside HTML tags"""

    # callback of the rule, called with the subset of arguments it declares
    func: RewriteDataCallable
def _check_decorated_func_signature(expected_func: Callable, decorated_func: Callable):
    """Ensure a rule function only declares parameters its dispatcher can provide

    Every parameter of `decorated_func` must exist in `expected_func` with the very
    same annotation. The rule is free to declare only a subset of the parameters.

    Raises TypeError on the first unknown parameter or annotation mismatch.
    """
    supported = _cached_signature(expected_func).parameters
    for name, param in _cached_signature(decorated_func).parameters.items():
        reference = supported.get(name)
        if reference is None:
            raise TypeError(
                f"Parameter '{name}' is unsupported in function "
                f"'{decorated_func.__name__}'"
            )
        if reference.annotation != param.annotation:
            raise TypeError(
                f"Parameter '{name}' in function '{decorated_func.__name__}' must be of"
                f" type '{reference.annotation}'"
            )
class HTMLRewritingRules:
    """Registry of all the rules used to rewrite HTML documents

    Rules are plain functions registered with the decorators of this class. Each
    kind of rule is kept in its own set, so the order in which rules of a same kind
    are applied is not specified.
    """

    def __init__(self) -> None:
        self.drop_attribute_rules: set[DropAttributeRule] = set()
        self.rewrite_attribute_rules: set[RewriteAttributeRule] = set()
        self.rewrite_tag_rules: set[RewriteTagRule] = set()
        self.rewrite_data_rules: set[RewriteDataRule] = set()

    @staticmethod
    def _call_rule(func: Callable, available_args: dict):
        """Call a rule function with only the arguments named in its signature"""
        accepted = _cached_signature(func).parameters
        return func(
            **{
                arg_name: arg_value
                for arg_name, arg_value in available_args.items()
                if arg_name in accepted
            }
        )

    def drop_attribute(
        self,
    ) -> Callable[[DropAttributeCallable], DropAttributeCallable]:
        """Decorator registering a rule deciding when an attribute is dropped"""

        def decorator(func: DropAttributeCallable) -> DropAttributeCallable:
            _check_decorated_func_signature(self._do_drop_attribute, func)
            self.drop_attribute_rules.add(DropAttributeRule(func=func))
            return func

        return decorator

    def rewrite_attribute(
        self,
    ) -> Callable[[RewriteAttributeCallable], RewriteAttributeCallable]:
        """Decorator registering a rule rewriting an attribute name and value"""

        def decorator(func: RewriteAttributeCallable) -> RewriteAttributeCallable:
            _check_decorated_func_signature(self._do_attribute_rewrite, func)
            self.rewrite_attribute_rules.add(RewriteAttributeRule(func=func))
            return func

        return decorator

    def rewrite_tag(
        self,
    ) -> Callable[[RewriteTagCallable], RewriteTagCallable]:
        """Decorator registering a rule rewriting a whole start tag

        To be used when the complete start tag must be rewritten, including
        startend tags (autoclosing tags).
        """

        def decorator(func: RewriteTagCallable) -> RewriteTagCallable:
            _check_decorated_func_signature(self._do_tag_rewrite, func)
            self.rewrite_tag_rules.add(RewriteTagRule(func=func))
            return func

        return decorator

    def rewrite_data(
        self,
    ) -> Callable[[RewriteDataCallable], RewriteDataCallable]:
        """Decorator registering a rule rewriting the data of a tag"""

        def decorator(func: RewriteDataCallable) -> RewriteDataCallable:
            _check_decorated_func_signature(self._do_data_rewrite, func)
            self.rewrite_data_rules.add(RewriteDataRule(func=func))
            return func

        return decorator

    def _do_drop_attribute(
        self, tag: str, attr_name: str, attr_value: str | None, attrs: AttrsList
    ) -> bool:
        """Run the attribute dropping rules

        Returns True as soon as one rule returns exactly True, False otherwise
        """
        available_args = {
            "tag": tag,
            "attr_name": attr_name,
            "attr_value": attr_value,
            "attrs": attrs,
        }
        for rule in self.drop_attribute_rules:
            if self._call_rule(rule.func, available_args) is True:
                return True
        return False

    def _do_attribute_rewrite(
        self,
        tag: str,
        attr_name: str,
        attr_value: str | None,
        attrs: AttrsList,
        js_rewriter: JsRewriter,
        css_rewriter: CssRewriter,
        url_rewriter: ArticleUrlRewriter,
        base_href: str | None,
        notify_js_module: Callable[[ZimPath], None],
    ) -> AttrNameAndValue:
        """Run the attribute rewriting rules

        Attributes without value are returned as-is. Otherwise every rule is
        called, each one receiving the name and value produced by the rules which
        already matched. Returns the final attribute name and value.
        """
        if attr_value is None:
            return attr_name, None

        for rule in self.rewrite_attribute_rules:
            # arguments are rebuilt for every rule since name/value may have changed
            rewritten = self._call_rule(
                rule.func,
                {
                    "tag": tag,
                    "attr_name": attr_name,
                    "attr_value": attr_value,
                    "attrs": attrs,
                    "js_rewriter": js_rewriter,
                    "css_rewriter": css_rewriter,
                    "url_rewriter": url_rewriter,
                    "base_href": base_href,
                    "notify_js_module": notify_js_module,
                },
            )
            if rewritten is not None:
                attr_name, attr_value = rewritten

        return attr_name, attr_value

    def _do_tag_rewrite(
        self,
        tag: str,
        attrs: AttrsList,
        *,
        auto_close: bool,
    ) -> str | None:
        """Run the tag rewriting rules

        Returns the value of the first rule returning something, None otherwise
        """
        available_args = {"tag": tag, "attrs": attrs, "auto_close": auto_close}
        for rule in self.rewrite_tag_rules:
            rewritten = self._call_rule(rule.func, available_args)
            if rewritten is not None:
                return rewritten
        return None

    def _do_data_rewrite(
        self,
        html_rewrite_context: str | None,
        data: str,
        css_rewriter: CssRewriter,
        js_rewriter: JsRewriter,
        url_rewriter: ArticleUrlRewriter,
    ) -> str | None:
        """Run the data rewriting rules

        Returns the value of the first rule returning something, None otherwise
        """
        available_args = {
            "html_rewrite_context": html_rewrite_context,
            "data": data,
            "css_rewriter": css_rewriter,
            "js_rewriter": js_rewriter,
            "url_rewriter": url_rewriter,
        }
        for rule in self.rewrite_data_rules:
            rewritten = self._call_rule(rule.func, available_args)
            if rewritten is not None:
                return rewritten
        return None
# Module-level registry: the functions below register themselves on it through its
# decorators, and the HTML handler methods above call its `_do_*` methods.
rules = HTMLRewritingRules()
@rules.drop_attribute()
def drop_script_integrity_attribute(tag: str, attr_name: str):
    """Drop the `integrity` attribute of <script> tags"""
    return (tag, attr_name) == ("script", "integrity")
@rules.drop_attribute()
def drop_link_integrity_attribute(tag: str, attr_name: str):
    """Drop the `integrity` attribute of <link> tags"""
    return (tag, attr_name) == ("link", "integrity")
@rules.rewrite_attribute()
def rewrite_meta_charset_content(
    tag: str, attr_name: str, attrs: AttrsList
) -> AttrNameAndValue | None:
    """Force the charset declared in <meta> tags to UTF-8

    Both <meta charset='xxx'> and
    <meta http-equiv='content-type' content='text/html; charset=xxx'> are handled.
    """
    if tag != "meta":
        return None
    if attr_name == "charset":
        return (attr_name, "UTF-8")
    if attr_name != "content":
        return None

    # `content` only carries a charset when the tag is an http-equiv content-type
    declares_content_type = any(
        name.lower() == "http-equiv" and value and value.lower() == "content-type"
        for name, value in attrs
    )
    if declares_content_type:
        return (attr_name, "text/html; charset=UTF-8")
    return None
@rules.rewrite_attribute()
def rewrite_onxxx_tags(
    attr_name: str, attr_value: str | None, js_rewriter: JsRewriter
) -> AttrNameAndValue | None:
    """Rewrite the JS code of `onxxx` attributes (but not of `on-xxx` ones)"""
    if not attr_value:
        return None
    if not attr_name.startswith("on") or attr_name.startswith("on-"):
        return None
    return (attr_name, js_rewriter.rewrite(attr_value))
@rules.rewrite_attribute()
def rewrite_style_tags(
    attr_name: str, attr_value: str | None, css_rewriter: CssRewriter
) -> AttrNameAndValue | None:
    """Rewrite the inline CSS of `style` attributes"""
    if attr_name != "style" or not attr_value:
        return None
    return (attr_name, css_rewriter.rewrite_inline(attr_value))
@rules.rewrite_attribute()
def rewrite_href_src_attributes(
    tag: str,
    attr_name: str,
    attr_value: str | None,
    attrs: AttrsList,
    url_rewriter: ArticleUrlRewriter,
    base_href: str | None,
    notify_js_module: Callable[[ZimPath], None],
):
    """Rewrite the URL of `href` and `src` attributes

    When the tag is in a `js-module` rewrite context, the path of the item is also
    passed to `notify_js_module`, so that the script can be rewritten as a module
    when it is encountered later on.
    """
    if not attr_value or attr_name not in ("href", "src"):
        return None

    is_js_module = get_html_rewrite_context(tag=tag, attrs=attrs) == "js-module"
    if is_js_module:
        notify_js_module(url_rewriter.get_item_path(attr_value, base_href=base_href))

    # `rewrite_all_url` is set for every tag but <a>
    rewritten_url = url_rewriter(
        attr_value, base_href=base_href, rewrite_all_url=tag != "a"
    )
    return (attr_name, rewritten_url)
@rules.rewrite_attribute()
def rewrite_srcset_attribute(
    attr_name: str,
    attr_value: str | None,
    url_rewriter: ArticleUrlRewriter,
    base_href: str | None,
):
    """Rewrite the URLs of `srcset` attributes

    The value is a comma-separated list of candidates, each one made of an URL
    optionally followed by a descriptor (e.g. `2x` or `480w`). Only the URL is
    rewritten, the descriptor is kept.

    URL and descriptor may be separated by any whitespace (space, tab, newline),
    and empty candidates (e.g. caused by a trailing comma) are ignored instead of
    being passed to the URL rewriter.

    NOTE(review): URLs containing a comma are still split in two candidates, like
    before.
    """
    if attr_name != "srcset" or not attr_value:
        return None

    rewritten_candidates = []
    for candidate in attr_value.split(","):
        parts = candidate.strip().split(maxsplit=1)
        if not parts:
            continue  # nothing between two commas, or after the last one
        url, *descriptor = parts
        rewritten_candidates.append(
            " ".join([url_rewriter(url, base_href=base_href), *descriptor])
        )
    return (attr_name, ", ".join(rewritten_candidates))
@rules.rewrite_tag()
def rewrite_base_tag(tag: str, attrs: AttrsList, *, auto_close: bool):
    """Remove the `href` attribute of <base> tags

    The whole tag is rewritten because nothing might remain once `href` is removed,
    in which case the tag is dropped (an empty string is returned).
    """
    if tag != "base":
        return None
    if get_attr_value_from(attrs, "href") is None:
        return None  # needed so that other rules will be processed as well

    values = " ".join(
        format_attr(attr_name, attr_value)
        for attr_name, attr_value in attrs
        if attr_name != "href"
    )
    if not values:
        return ""  # drop whole tag
    return f"<base {values}{'/>' if auto_close else '>'}"
@rules.rewrite_data()
def rewrite_css_data(
    html_rewrite_context: str | None, data: str, css_rewriter: CssRewriter
) -> str | None:
    """Rewrite the CSS found in a `style` rewrite context"""
    if html_rewrite_context == "style":
        return css_rewriter.rewrite(data)
    return None
@rules.rewrite_data()
def rewrite_json_data(
    html_rewrite_context: str | None,
) -> str | None:
    """Placeholder for the rewriting of inline JSON, currently never rewriting"""
    if html_rewrite_context == "json":
        # we do not have any JSON rewriting left ATM since all these rules are
        # applied in Browsertrix crawler before storing the WARC record for now
        pass
    return None
@rules.rewrite_data()
def rewrite_js_data(
    html_rewrite_context: str | None,
    data: str,
    js_rewriter: JsRewriter,
) -> str | None:
    """Rewrite the JS found in any `js-*` rewrite context

    The code is rewritten as a module when the context is `js-module`.
    """
    if not html_rewrite_context or not html_rewrite_context.startswith("js-"):
        return None
    is_module = html_rewrite_context == "js-module"
    return js_rewriter.rewrite(data, opts={"isModule": is_module})
@rules.rewrite_attribute()
def rewrite_meta_http_equiv_redirect(
    tag: str,
    attr_name: str,
    attr_value: str | None,
    attrs: AttrsList,
    url_rewriter: ArticleUrlRewriter,
    base_href: str | None,
) -> AttrNameAndValue | None:
    """Rewrite the redirect URL of <meta http-equiv='refresh' content='...'>

    Only the `content` attribute is rewritten, and only when its value matches
    HTTP_EQUIV_REDIRECT_RE. Returns None in all other situations.

    The `http-equiv` keyword is matched case-insensitively, so that the widespread
    `http-equiv="Refresh"` spelling is rewritten as well.
    """
    if tag != "meta" or attr_name != "content" or not attr_value:
        return None

    http_equiv = get_attr_value_from(attrs, "http-equiv")
    if http_equiv is None or http_equiv.lower() != "refresh":
        return None

    match = HTTP_EQUIV_REDIRECT_RE.match(attr_value)
    if match is None:
        return None

    return (
        attr_name,
        f"{match['interval']};url={url_rewriter(match['url'], base_href=base_href)}",
    )

View file

@ -1,293 +0,0 @@
import re
from collections.abc import Callable, Iterable
from typing import Any
from warc2zim.content_rewriting.rx_replacer import (
RxRewriter,
TransformationAction,
TransformationRule,
add_prefix,
m2str,
replace,
replace_prefix_from,
)
from warc2zim.url_rewriting import ArticleUrlRewriter, ZimPath
# The regex used to find `import ...` statements to rewrite in module code. It
# matches from the beginning of a line up to the quote (or whitespace) ending the
# imported location.
IMPORT_MATCH_RX = re.compile(
r"""^\s*?import(?:['"\s]*(?:[\w*${}\s,]+from\s*)?['"\s]?['"\s])(?:.*?)['"\s]""",
)
# A sub regex applied on what IMPORT_MATCH_RX matched, to rewrite the imported
# location. Groups: (1) everything up to the opening quote, (2) the location, only
# when it starts with `http`, `https`, `.` or `/`, (3) the closing quote/whitespace.
IMPORT_HTTP_RX = re.compile(
r"""(import(?:['"\s]*(?:[\w*${}\s,]+from\s*)?['"\s]?['"\s]))((?:https?|[./]).*?)(['"\s])""",
)
# The list of global variables we want to wrap.
# The wrapping is set up only if the JS script uses at least one of them.
GLOBAL_OVERRIDES = [
"window",
"globalThis",
"self",
"document",
"location",
"top",
"parent",
"frames",
"opener",
]
# Detects the usage of one of the GLOBAL_OVERRIDES as a standalone name: neither
# preceded by `$` or `.`, nor followed by `$`.
GLOBALS_RX = re.compile(
r"("
+ "|".join([r"(?:^|[^$.])\b" + x + r"\b(?:$|[^$])" for x in GLOBAL_OVERRIDES])
+ ")"
)
# The expression replacing `this` in rewritten code.
# NOTE(review): `_____WB$wombat$check$this$function_____` is presumably provided by
# wombat at runtime and may return a "wrapper" around `this` - not defined here.
this_rw = "_____WB$wombat$check$this$function_____(this)"
def add_suffix_non_prop(suffix) -> TransformationAction:
    """
    Create a rewrite_function appending `suffix` to the matched text.
    Nothing is appended when the character right before the match is `.` or `$`.
    """

    def f(m_object, _opts):
        matched = m_object[0]
        start = m_object.start()
        is_property = start > 0 and m_object.string[start - 1] in ".$"
        return matched if is_property else matched + suffix

    return f
def replace_this() -> TransformationAction:
    """
    Create a rewrite_function substituting `this_rw` for "this" in the matched text.
    """
    return replace(src="this", target=this_rw)
def replace_this_non_prop() -> TransformationAction:
    """
    Create a rewrite_function replacing "this" by `this_rw`.
    Replacement happens only if "this" is not a property of an object, i.e. when
    the character right before the match is neither `.` nor `$`. When this character
    is a newline, a `;` is also prepended to `this_rw`.
    A match located at the very beginning of the text has no previous character and
    is replaced as well.
    """

    def f(m_object, _opts):
        offset = m_object.start()
        prev = m_object.string[offset - 1] if offset > 0 else ""
        if prev == "\n":
            return m_object[0].replace("this", ";" + this_rw)
        # compare with a tuple: `"" in ".$"` is True, which would wrongly make a
        # match at offset 0 be considered as a property access
        if prev not in (".", "$"):
            return m_object[0].replace("this", this_rw)
        return m_object[0]

    return f
def replace_import(src, target) -> TransformationAction:
    """
    Create a rewrite_function replacing `src` by `target` in the matched text and
    appending a first call argument to it.
    It is meant to rewrite dynamic `import(` calls: the appended argument is
    `import.meta.url` when rewriting a module (`isModule` option), an empty JS
    string otherwise.
    """

    def f(m_object, opts):
        in_module = opts and opts.get("isModule")
        first_arg = "import.meta.url, " if in_module else '"", '
        return m_object[0].replace(src, target) + first_arg

    return f
def create_js_rules() -> list[TransformationRule]:
    """
    Create all the JS transformation rules.

    A transformation rule is a tuple (regex, rewrite_function). When the regex
    matches in the script being rewritten, the match object is passed to the
    rewrite_function, together with the `opts` dictionary given to
    `JsRewriter.rewrite`. This is mostly as if we were calling
    `re.sub(regex, rewrite_function, script_text)`.

    All regexes are combined and match non-overlapping text only: the first rule
    matching a piece of text is applied, potentially preventing further rules from
    matching there.
    """

    # Replacement for `location = `: the assignment is redirected to the `href` of
    # what `__WB_check_loc` returns (or of a throwaway object).
    check_loc = (
        "((self.__WB_check_loc && self.__WB_check_loc(location, arguments)) || "
        "{}).href = "
    )

    # Replacement for `eval(`.
    eval_str = (
        "WB_wombat_runEval2((_______eval_arg, isGlobal) => { var ge = eval; return "
        "isGlobal ? ge(_______eval_arg) : "
        "eval(_______eval_arg); }).eval(this, (function() { return arguments })(),"
    )

    wrap_this = replace_this()
    global_names = "|".join(GLOBAL_OVERRIDES)

    def keep_as_is(text):
        return text

    return [
        # rewriting `eval(...)` - invocation
        (re.compile(r"(?:^|\s)\beval\s*\("), replace_prefix_from(eval_str, "eval")),
        # rewriting `x = eval` - no invocation
        (re.compile(r"[=]\s*\beval\b(?![(:.$])"), replace("eval", "self.eval")),
        # rewriting `.postMessage(` -> `.__WB_pmw(self).postMessage(`
        (re.compile(r"\.postMessage\b\("), add_prefix(".__WB_pmw(self)")),
        # rewriting `location = ` to the custom `(...).href =` assignment
        (
            re.compile(r"[^$.]?\s?\blocation\b\s*[=]\s*(?![\s\d=])"),
            add_suffix_non_prop(check_loc),
        ),
        # rewriting `return this`
        (re.compile(r"\breturn\s+this\b\s*(?![\s\w.$])"), wrap_this),
        # rewriting `this.` access to special properties: `;` is prepended when the
        # previous char is `\n`, nothing is rewritten when it is `.` or `$`
        (
            re.compile(rf"[^$.]\s?\bthis\b(?=(?:\.(?:{global_names})\b))"),
            replace_this_non_prop(),
        ),
        # rewriting `= this` or `, this`
        (re.compile(r"[=,]\s*\bthis\b\s*(?![\s\w:.$])"), wrap_this),
        # rewriting `})(this)`
        (re.compile(r"\}(?:\s*\))?\s*\(this\)"), wrap_this),
        # rewriting `this` in `&&` or `||` expressions
        (
            re.compile(r"[^|&][|&]{2}\s*this\b\s*(?![|\s&.$](?:[^|&]|$))"),
            wrap_this,
        ),
        # ignore `async import`.
        # As this rule matches first, it prevents the next rule (matching `import`)
        # from being applied to `async import`.
        (re.compile(r"async\s+import\s*\("), m2str(keep_as_is)),
        # esm dynamic import, if found, mark as module
        (
            re.compile(r"[^$.]\bimport\s*\("),
            replace_import("import", "____wb_rewrite_import__"),
        ),
    ]
# Rules shared by all rewriters, built once at import time; `JsRewriter.rewrite`
# works on a copy of this list.
REWRITE_JS_RULES = create_js_rules()
class JsRewriter(RxRewriter):
    """
    Rewriter of the JS code stored in our ZIM file.

    Rewriting is based on the REWRITE_JS_RULES regex rules, plus one more rule for
    static imports when the code is a module.
    """

    def __init__(
        self,
        url_rewriter: ArticleUrlRewriter,
        base_href: str | None,
        notify_js_module: Callable[[ZimPath], None],
    ):
        # rules are compiled on every call of `rewrite`, none is needed for now
        super().__init__(None)
        self.url_rewriter = url_rewriter
        self.base_href = base_href
        self.notify_js_module = notify_js_module
        self.first_buff = self._init_local_declaration(GLOBAL_OVERRIDES)
        self.last_buff = "\n}"

    def _init_local_declaration(self, local_decls: Iterable[str]) -> str:
        """
        Create the text to add at the beginning of a (non-module) script.

        It opens a block in which every name of `local_decls` is redeclared with
        the value returned by the wombat assign function. `rewrite` adds it only to
        scripts using at least one of the overridden globals.
        """
        assign_func = "_____WB$wombat$assign$function_____"
        lines = [
            f"var {assign_func} = function(name) "
            "{return (self._wb_wombat && self._wb_wombat.local_init && "
            "self._wb_wombat.local_init(name)) || self[name]; };",
            "if (!self.__WB_pmw) { self.__WB_pmw = function(obj) "
            "{ this.__WB_source = obj; return this; } }",
            "{",
        ]
        lines.extend(f'let {decl} = {assign_func}("{decl}");' for decl in local_decls)
        lines.append("let arguments;")
        return "\n".join(lines) + "\n\n"

    def _get_module_decl(self, local_decls: Iterable[str]) -> str:
        """
        Create the text to add at the beginning of a module script: an import of
        all `local_decls` from the `_zim_static/__wb_module_decl.js` static item.
        """
        wb_module_decl_url = self.url_rewriter.get_document_uri(
            ZimPath("_zim_static/__wb_module_decl.js"), ""
        )
        imported_names = ", ".join(local_decls)
        return f'import {{ {imported_names} }} from "{wb_module_decl_url}";\n'

    def rewrite(self, text: str, opts: dict[str, Any] | None = None) -> str:
        """
        Rewrite the JS code in `text`.

        Supported `opts`: `isModule` when the code is an ES module, `inline` to
        replace newlines by spaces in the result of a non-module script.
        """
        opts = opts or {}
        is_module = opts.get("isModule", False)

        rules = list(REWRITE_JS_RULES)
        if is_module:
            rules.append(self._get_esm_import_rule())
        self._compile_rules(rules)

        new_text = super().rewrite(text, opts)

        if is_module:
            return self._get_module_decl(GLOBAL_OVERRIDES) + new_text

        # wrap the script only when the original code uses an overridden global
        if GLOBALS_RX.search(text):
            new_text = self.first_buff + new_text + self.last_buff

        if opts.get("inline", False):
            new_text = new_text.replace("\n", " ")

        return new_text

    def _get_esm_import_rule(self) -> TransformationRule:
        """Create the rule rewriting the location of static `import` statements"""

        def get_rewriten_import_url(url):
            """Rewrite the import URL

            This takes into account that the result must be a relative URL, i.e. it
            cannot be 'vendor.module.js' but must be './vendor.module.js'.
            """
            url = self.url_rewriter(url, base_href=self.base_href)
            if url.startswith(("/", "./", "../")):
                return url
            return "./" + url

        def rewrite_location(match):
            # the imported script is a module as well, notify about it
            self.notify_js_module(
                self.url_rewriter.get_item_path(
                    match.group(2), base_href=self.base_href
                )
            )
            return (
                f"{match.group(1)}{get_rewriten_import_url(match.group(2))}"
                f"{match.group(3)}"
            )

        def rewrite_import(m_object, _opts):
            return IMPORT_HTTP_RX.sub(rewrite_location, m_object[0])

        return (IMPORT_MATCH_RX, rewrite_import)

View file

@ -1,143 +0,0 @@
import re
from collections.abc import Callable, Iterable
from typing import Any
# A rewrite function: receives the match object and the rewriting options, returns
# the text replacing the match
TransformationAction = Callable[[re.Match, dict], str]
# A rule: the regex to search for and the action to apply on what it matched
TransformationRule = tuple[re.Pattern, TransformationAction]
def m2str(function) -> TransformationAction:
    """
    Adapt a function working on a string into a rewrite_function.

    Many rewrite functions only need the matched text, neither the match object nor
    the options. The returned action calls `function` with the whole matched text.
    """

    def wrapper(m_object: re.Match, _opts: dict) -> str:
        matched_text = m_object[0]
        return function(matched_text)

    return wrapper
def add_around(prefix: str, suffix: str) -> TransformationAction:
    """
    Create a rewrite_function surrounding the match with `prefix` and `suffix`.
    """
    return m2str(lambda matched_text: prefix + matched_text + suffix)
def add_prefix(prefix: str) -> TransformationAction:
    """
    Create a rewrite_function inserting `prefix` before the matched text.
    """
    return add_around(prefix=prefix, suffix="")
def add_suffix(suffix: str) -> TransformationAction:
    """
    Create a rewrite_function appending `suffix` after the matched text.
    """
    return add_around(prefix="", suffix=suffix)
def replace_prefix_from(prefix: str, match: str) -> TransformationAction:
    """
    Create a rewrite_function which keeps what precedes the first occurrence of
    `match` in the matched text, and replaces the rest (`match` included) with
    `prefix`.

    E.g. with prefix `P(` and match `eval`, the matched text ` eval(` becomes ` P(`.

    The created function raises ValueError if `match` is not in the matched text.
    """

    @m2str
    def f(x) -> str:
        # slicing up to index 0 gives an empty string, no special case is needed
        return x[: x.index(match)] + prefix

    return f
def replace(src, target) -> TransformationAction:
    """
    Create a rewrite_function replacing every `src` by `target` in the matched text.
    """
    return m2str(lambda matched_text: matched_text.replace(src, target))
def replace_all(text: str) -> TransformationAction:
    """
    Create a rewrite_function replacing the whole match with `text`.
    """
    return m2str(lambda _matched_text: text)
class RxRewriter:
    """
    RxRewriter is a generic rewriter based on regexes.

    The main "input" is a list of rules, each rule being a tuple (regex,
    rewriting_function). We want to apply each rule to the content, but doing it
    blindly is counter-productive: it would mean N replacements (N == number of
    rules).

    To avoid that, we create one unique regex (`compiled_rule`) equivalent to
    `(regex0)|(regex1)|(regex2)|...` and we do only one replacement with this regex.
    When we have a match, we look for the group of the rule which matched and apply
    the associated rewriting_function.

    Flags of the individual regexes are not kept, the unique regex is compiled with
    `re.M` only. Regexes of the rules may contain capturing groups, but these are
    renumbered inside the unique regex: rewriting functions and backreferences
    should not rely on group numbers.
    """

    def __init__(
        self,
        rules: Iterable[TransformationRule] | None = None,
    ):
        # rules are stored as a list since they are iterated many times
        self.rules: list[TransformationRule] = list(rules or [])
        self.compiled_rule: re.Pattern | None = None
        if self.rules:
            self._compile_rules(self.rules)

    def _compile_rules(self, rules: Iterable[TransformationRule]):
        """
        Compile all the regexes of the rules into only one `compiled_rule` pattern
        """
        # copy the rules so that one-shot iterables are not exhausted below
        self.rules = list(rules)
        rx_buff = "|".join(f"({rule[0].pattern})" for rule in self.rules)
        self.compiled_rule = re.compile(f"(?:{rx_buff})", re.M)

    def rewrite(
        self,
        text: str | bytes,
        opts: dict[str, Any],
    ) -> str:
        """
        Apply the unique `compiled_rule` pattern and replace the content.

        `text` is decoded first when passed as bytes. `opts` is passed as-is to the
        rewriting functions.
        """
        if isinstance(text, bytes):
            text = text.decode()

        def replace(m_object):
            """
            Search for the specific rule which has matched and apply it.
            """
            group_index = 1
            for pattern, action in self.rules:
                if m_object.group(group_index) is not None:
                    return action(m_object, opts)
                # skip the group wrapping this rule and the groups of its own regex
                group_index += 1 + pattern.groups
            # no rule identified: keep the text untouched
            return m_object[0]

        assert self.compiled_rule is not None  # noqa
        return self.compiled_rule.sub(replace, text)

View file

@ -40,28 +40,21 @@ from jinja2 import Environment, PackageLoader
from warcio import ArchiveIterator
from warcio.recordloader import ArcWarcRecord
from zimscraperlib.constants import (
DEFAULT_DEV_ZIM_METADATA,
RECOMMENDED_MAX_TITLE_LENGTH,
)
from zimscraperlib.download import stream_file
from zimscraperlib.image.conversion import convert_image, convert_svg2png
from zimscraperlib.image.probing import format_for
from zimscraperlib.image.transformation import resize_image
from zimscraperlib.rewriting.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
from zimscraperlib.types import FALLBACK_MIME
from zimscraperlib.zim import metadata
from zimscraperlib.zim.creator import Creator
from zimscraperlib.zim.metadata import (
validate_description,
validate_language,
validate_longdescription,
validate_tags,
validate_title,
)
from warc2zim.constants import logger
from warc2zim.icon_finder import Icon, get_sorted_icons, icons_in_html
from warc2zim.items import StaticArticle, StaticFile, WARCPayloadItem
from warc2zim.language import parse_language
from warc2zim.url_rewriting import HttpUrl, ZimPath, normalize
from warc2zim.utils import (
can_process_status_code,
get_record_content,
@ -140,7 +133,9 @@ class Converter:
}
self.source: str | None = str(args.source) if args.source else None or main_url
self.scraper = "warc2zim " + get_version()
self.main_path = normalize(HttpUrl(main_url)) if main_url else None
self.main_path = (
ArticleUrlRewriter.normalize(HttpUrl(main_url)) if main_url else None
)
self.output = Path(args.output)
self.zim_file = args.zim_file
@ -271,16 +266,16 @@ class Converter:
if not self.disable_metadata_checks:
# Validate ZIM metadata early so that we do not waste time doing operations
# for a scraper which will fail anyway in the end
validate_tags("Tags", self.tags)
metadata.TagsMetadata(self.tags)
if self.title:
validate_title("Title", self.title)
metadata.TitleMetadata(self.title)
if self.description:
validate_description("Description", self.description)
metadata.DescriptionMetadata(self.description)
if self.long_description:
validate_longdescription("LongDescription", self.long_description)
metadata.LongDescriptionMetadata(self.long_description)
if self.language:
self.language = parse_language(self.language)
validate_language("Language", self.language)
metadata.LanguageMetadata(self.language)
# Nota: we do not validate illustration since logic in the scraper is made
# to always provide a valid image, at least a fallback transparent PNG and
# final illustration is most probably not yet known at this stage
@ -303,7 +298,7 @@ class Converter:
self.language = "eng"
# validate language definitely, could have been retrieved from WARC or fallback
validate_language("Language", self.language)
metadata.LanguageMetadata(self.language)
if not self.main_path:
raise ValueError("Unable to find main path, aborting")
self.title = self.title or "Untitled"
@ -335,43 +330,64 @@ class Converter:
)
self.creator.config_metadata(
Name=self.name,
Language=self.language or "eng",
Title=self.title,
Description=self.description,
LongDescription=self.long_description,
Creator=self.creator_metadata,
Publisher=self.publisher,
Date=datetime.date.today(), # noqa: DTZ011
Illustration_48x48_at_1=self.illustration,
Tags=self.tags,
Source=self.source,
Scraper=",".join(
filter(
lambda x: x, # remove None values
[
f"warc2zim {get_version()}",
self.warc_software,
self.scraper_suffix,
],
)
metadata.StandardMetadataList(
Name=metadata.NameMetadata(self.name),
Language=metadata.LanguageMetadata(self.language),
Title=metadata.TitleMetadata(self.title),
Description=metadata.DescriptionMetadata(self.description),
LongDescription=(
metadata.LongDescriptionMetadata(self.long_description)
if self.long_description
else None
),
Creator=metadata.CreatorMetadata(self.creator_metadata),
Publisher=metadata.PublisherMetadata(self.publisher),
Date=metadata.DateMetadata(
datetime.datetime.now(tz=datetime.UTC).date()
),
Illustration_48x48_at_1=metadata.DefaultIllustrationMetadata(
self.illustration
),
Tags=(metadata.TagsMetadata(self.tags) if self.tags else None),
Scraper=metadata.ScraperMetadata(
",".join(
filter(
lambda x: x, # remove None values
[
f"warc2zim {get_version()}",
self.warc_software,
self.scraper_suffix,
],
)
)
),
),
).start()
if self.warc_start and self.warc_end:
if self.warc_start == self.warc_end:
self.creator.add_metadata(
"X-ContentDate", self.warc_start.strftime("%Y-%m-%d")
metadata.XCustomTextMetadata(
"X-ContentDate", self.warc_start.strftime("%Y-%m-%d")
)
)
else:
self.creator.add_metadata(
"X-ContentDate",
f"{self.warc_start.strftime('%Y-%m-%d')},"
f"{self.warc_end.strftime('%Y-%m-%d')}",
metadata.XCustomTextMetadata(
"X-ContentDate",
f"{self.warc_start.strftime('%Y-%m-%d')},"
f"{self.warc_end.strftime('%Y-%m-%d')}",
)
)
for filename in importlib.resources.files("warc2zim.statics").iterdir():
for filename in importlib.resources.files(
"zimscraperlib.rewriting.statics"
).iterdir():
if not filename.is_file():
continue
with importlib.resources.as_file(filename) as file:
if file.suffix != ".js":
continue
self.creator.add_item(
StaticArticle(filename=file, main_path=self.main_path.value)
)
@ -474,7 +490,7 @@ class Converter:
if not (url.startswith("http://") or url.startswith("https://")):
continue
zim_path = normalize(HttpUrl(url))
zim_path = ArticleUrlRewriter.normalize(HttpUrl(url))
status_code = get_status_code(record)
if not can_process_status_code(status_code):
@ -493,7 +509,7 @@ class Converter:
if zim_path not in self.redirections:
if redirect_location := record.http_headers.get("Location"):
try:
redirection_zim_path = normalize(
redirection_zim_path = ArticleUrlRewriter.normalize(
HttpUrl(urljoin(url, redirect_location))
)
# Redirection to same ZIM path have to be ignored (occurs
@ -563,7 +579,7 @@ class Converter:
HTTPStatus.FOUND,
]:
original_path = self.main_path
self.main_path = normalize(
self.main_path = ArticleUrlRewriter.normalize(
HttpUrl(
urljoin(
get_record_url(record),
@ -708,7 +724,8 @@ class Converter:
# compute paths of favicons so that we can process them on-the-fly while
# iterating the records
self.favicon_paths = {
normalize(icon_url): icon_url for icon_url in self.favicon_urls
ArticleUrlRewriter.normalize(icon_url): icon_url
for icon_url in self.favicon_urls
}
self.favicon_contents: dict[HttpUrl, bytes | None] = {
icon_url: None for icon_url in self.favicon_urls
@ -875,7 +892,9 @@ class Converter:
# Or fallback to default ZIM illustration
logger.warning("No suitable illustration found, using default")
self.illustration = DEFAULT_DEV_ZIM_METADATA["Illustration_48x48_at_1"]
self.illustration = (
metadata.DEFAULT_DEV_ZIM_METADATA.Illustration_48x48_at_1.value
)
def is_self_redirect(self, record, url):
if record.rec_type != "response":
@ -889,7 +908,9 @@ class Converter:
location = record.http_headers.get("Location", "")
location = urljoin(url, location)
return normalize(HttpUrl(url)) == normalize(HttpUrl(location))
return ArticleUrlRewriter.normalize(
HttpUrl(url)
) == ArticleUrlRewriter.normalize(HttpUrl(location))
def add_items_for_warc_record(self, record):
@ -908,7 +929,7 @@ class Converter:
logger.debug(f"Skipping record with non HTTP(S) WARC-Target-URI {url}")
return
item_zim_path = normalize(HttpUrl(url))
item_zim_path = ArticleUrlRewriter.normalize(HttpUrl(url))
# if include_domains is set, only include urls from those domains
if self.include_domains:
@ -981,7 +1002,7 @@ class Converter:
and record.rec_headers["WARC-Refers-To-Target-URI"] != url
and item_zim_path not in self.revisits
): # pragma: no branch
self.revisits[item_zim_path] = normalize(
self.revisits[item_zim_path] = ArticleUrlRewriter.normalize(
HttpUrl(record.rec_headers["WARC-Refers-To-Target-URI"])
)

View file

@ -11,11 +11,11 @@ from pathlib import Path
from jinja2.environment import Template
from libzim.writer import Hint # pyright: ignore[reportMissingModuleSource]
from warcio.recordloader import ArcWarcRecord
from zimscraperlib.rewriting.url_rewriting import ZimPath
from zimscraperlib.types import get_mime_for_name
from zimscraperlib.zim.items import StaticItem
from warc2zim.content_rewriting.generic import Rewriter
from warc2zim.url_rewriting import ZimPath
from warc2zim.rewriting import Rewriter
from warc2zim.utils import get_record_mime_type

View file

@ -1,4 +1,4 @@
from zimscraperlib.i18n import get_language_details
from zimscraperlib.i18n import get_language_or_none
from warc2zim.constants import logger
@ -13,17 +13,19 @@ def parse_language(input_lang: str) -> str:
Preserve language ordering (since it conveys meaning in ZIM metadata).
"""
langs = [] # use a list to preserve order
# transform input language into Language object (or None if not found)
langs = [get_language_or_none(lang.strip()) for lang in input_lang.split(",")]
for lang in [lang.strip() for lang in input_lang.split(",")]:
try:
lang_data = get_language_details(lang)
if parsed_lang := (lang_data.iso_639_3 if lang_data else None):
if parsed_lang not in langs:
langs.append(parsed_lang)
except Exception:
logger.warning(f"Skipping invalid language setting `{lang}`.")
continue # skip unrecognized
# get unique iso_639_3 codes, removing duplicates and None values, preserving order
langs = list(
dict.fromkeys(
[
lang.iso_639_3
for lang in langs
if lang is not None and lang.iso_639_3 is not None
]
)
)
if len(langs) == 0:
logger.warning(

View file

@ -5,12 +5,12 @@ from urllib.parse import quote, urlsplit
from jinja2.environment import Template
from warcio.recordloader import ArcWarcRecord
from zimscraperlib.rewriting.css import CssRewriter
from zimscraperlib.rewriting.html import HtmlRewriter
from zimscraperlib.rewriting.js import JsRewriter
from zimscraperlib.rewriting.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
from warc2zim.constants import logger
from warc2zim.content_rewriting.css import CssRewriter
from warc2zim.content_rewriting.html import HtmlRewriter
from warc2zim.content_rewriting.js import JsRewriter
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
from warc2zim.utils import (
get_record_content,
get_record_encoding,
@ -76,7 +76,9 @@ class Rewriter:
self.path = path
self.orig_url_str = get_record_url(record)
self.url_rewriter = ArticleUrlRewriter(
HttpUrl(self.orig_url_str), existing_zim_paths, missing_zim_paths
article_url=HttpUrl(self.orig_url_str),
existing_zim_paths=existing_zim_paths,
missing_zim_paths=missing_zim_paths,
)
self.rewrite_mode = self.get_rewrite_mode(record, mimetype)

View file

@ -1,36 +0,0 @@
// Resolve a global by name: go through the `local_init` hook of `self._wb_wombat`
// when it is present and returns something truthy, otherwise fall back to the
// property of the same name on `self`.
function wrapObj(name) {
  let wrapped = self._wb_wombat;
  if (wrapped) {
    wrapped = self._wb_wombat.local_init;
  }
  if (wrapped) {
    wrapped = self._wb_wombat.local_init(name);
  }
  return wrapped || self[name];
}

// Install `__WB_pmw` on `self` only when nothing is there yet. It must stay a
// regular function (not an arrow function) since it stores `obj` on its own
// receiver and returns that receiver.
if (!self.__WB_pmw) {
  self.__WB_pmw = function (obj) {
    this.__WB_source = obj;
    return this;
  };
}

// Module-local bindings for the globals, resolved once when the module is evaluated
const window = wrapObj("window");
const document = wrapObj("document");
const location = wrapObj("location");
const top = wrapObj("top");
const parent = wrapObj("parent");
const frames = wrapObj("frames");
const opener = wrapObj("opener");
// `self` and `globalThis` are still needed above as the real globals, so their
// wrapped versions get a private name here and are renamed on export
const __self = wrapObj("self");
const __globalThis = wrapObj("globalThis");

export {
  window,
  document,
  location,
  top,
  parent,
  frames,
  opener,
  __self as self,
  __globalThis as globalThis,
};

View file

@ -1,370 +0,0 @@
#!/usr/bin/env python
# vim: ai ts=4 sts=4 et sw=4 nu
""" warc2zim's url rewriting tools
This module is about url and entry path rewriting.
The global scheme is the following:
Entries are stored in the ZIM file using their fully decoded path:
- The full path is the full url without the scheme, username, password, port, fragment
(ie : "<host>/<path>(?<query_string)"). See documentation of the `normalize` function
for more details.
- urldecoded: the path itself must not be urlencoded or it would conflict with ZIM
specification and readers won't be able to retrieve it, some parts (e.g. querystring)
might be absorbed by a web server, ...
. This is valid : "foo/part with space/bar?key=value"
. This is NOT valid : "foo/part%20with%20space/bar%3Fkey%3Dvalue"
- even having multiple ? in a ZIM path is valid
. This is valid :
"foo/part/file with ? and +?who=Chip&Dale&question=It there any + here?"
. This is NOT valid :
"foo/part/file with %3F and +?who=Chip%26Dale&quer=Is%20there%20any%20%2B%20here%3F"
- space in query string must be stored as ` `, not `%2B`, `%20` or `+`, the `+` in a ZIM
path means a `%2B` in web resource (HTML document, ...):
. This is valid : "foo/part/file?question=Is there any + here?"
. This is NOT valid : "foo/part/file?question%3DIs%20there%20any%20%2B%20here%3F"
On top of that, fuzzy rules are applied on the ZIM path:
For instance a path "https://www.youtube.com/youtubei/v1/foo/baz/things?key=value
&other_key=other_value&videoId=xxxx&yet_another_key=yet_another_value"
is transformed to "youtube.fuzzy.replayweb.page/youtubei/v1/foo/baz/things?videoId=xxxx"
by slightly simplifying the path and keeping only the useful arguments in the
querystring.
When rewriting documents (HTML, CSS, JS, ...), every time we find a URI to rewrite we
start by resolving it into an absolute URL (based on the containing document absolute
URI), applying the transformation to compute the corresponding ZIM path and we
url-encode the whole ZIM path, so that readers will have one single blob to process,
url-decode and find corresponding ZIM entry. Only '/' separators are considered safe
and not url-encoded.
"""
from __future__ import annotations
import re
from pathlib import PurePosixPath
from urllib.parse import quote, unquote, urljoin, urlsplit, urlunsplit
import idna
from warc2zim.constants import logger
from warc2zim.rules import FUZZY_RULES
# Fuzzy rules with their `pattern` compiled once at import time; every entry keeps
# the rule's `replace` template next to the compiled regex stored under `match`
COMPILED_FUZZY_RULES = [
    dict(match=re.compile(spec["pattern"]), replace=spec["replace"])
    for spec in FUZZY_RULES
]
class HttpUrl:
    """Small value object wrapping an HTTP(S) URL string

    The value is validated on construction (see `check_validity`) and then exposed
    through the read-only `value` property. Two instances are equal, and share the
    same hash, when they wrap the same string.
    """

    def __init__(self, value: str) -> None:
        HttpUrl.check_validity(value)
        self._value = value

    def __eq__(self, __value: object) -> bool:
        if not isinstance(__value, HttpUrl):
            return False
        return __value.value == self.value

    def __hash__(self) -> int:
        return self.value.__hash__()

    def __str__(self) -> str:
        return f"HttpUrl({self.value})"

    @property
    def value(self) -> str:
        """The wrapped URL, exactly as passed to the constructor"""
        return self._value

    @classmethod
    def check_validity(cls, value: str) -> None:
        """Raise ValueError unless `value` has an http(s) scheme and a hostname"""
        split = urlsplit(value)

        scheme = split.scheme
        if scheme.lower() not in ["http", "https"]:
            raise ValueError(f"Incorrect HttpUrl scheme in value: {value} {scheme}")

        host = split.hostname
        if not host:
            raise ValueError(f"Unsupported empty hostname in value: {value}")

        # NOTE(review): urllib documents the `hostname` attribute as already
        # lower-cased, so this branch looks unreachable - confirm whether upper-case
        # hostnames are really meant to be rejected before changing it
        if host.lower() != host:
            raise ValueError(f"Unsupported upper-case chars in hostname : {value}")
class ZimPath:
    """A utility class representing a ZIM path, useful to pass this data around

    Includes a basic validation, ensuring that the path does NOT carry a scheme, a
    hostname, a username or a password (i.e. that it is not a URL).
    Two instances are equal, and share the same hash, when they wrap the same string.
    """

    def __init__(self, value: str) -> None:
        ZimPath.check_validity(value)
        self._value = value

    def __eq__(self, __value: object) -> bool:
        return isinstance(__value, ZimPath) and __value.value == self.value

    def __hash__(self) -> int:
        return self.value.__hash__()

    def __str__(self) -> str:
        return f"ZimPath({self.value})"

    @property
    def value(self) -> str:
        """The wrapped path, exactly as passed to the constructor"""
        return self._value

    @classmethod
    def check_validity(cls, value: str) -> None:
        """Raise ValueError when `value` parses as a URL rather than a bare path

        The value is split with `urlsplit`; the first non-empty component among
        scheme, hostname, username and password (checked in that order) is reported.
        """
        parts = urlsplit(value)
        for component in ("scheme", "hostname", "username", "password"):
            found = getattr(parts, component)
            if found:
                raise ValueError(f"Unexpected {component} in value: {value} {found}")
def apply_fuzzy_rules(uri: HttpUrl | str) -> str:
    """Run the compiled fuzzy rules against a URL or relative path

    Rules are tried in order and only the first one whose regex matches at the start
    of the input is applied: its `replace` template is expanded with the match.
    The input string is returned unchanged when no rule matches.
    """
    text = uri if not isinstance(uri, HttpUrl) else uri.value
    for compiled in COMPILED_FUZZY_RULES:
        found = compiled["match"].match(text)
        if found is not None:
            return found.expand(compiled["replace"])
    return text
def normalize(url: HttpUrl) -> ZimPath:
    """Compute the ZIM path under which the content of an HTTP URL is stored

    Only the hostname, the path and the querystring of the URL are kept; scheme,
    username, password, port and fragment are dropped:
    - two resources differing only by scheme or port are assumed to hold the same
      content
    - username / password are considered as pure authentication details with no
      impact on the content served
    - the fragment is never sent to the server, it stays client-side

    The URL is assumed to be encoded as per RFC 3986. Each kept component is decoded
    so that the resulting ZIM path is not encoded at all:
    - the hostname is decoded from punycode
    - the path is url-decoded; an empty path becomes `/` so that
      `https://example.com` and `https://example.com/` lead to the same entry
      (RFC 3986 section 6.2.3)
    - in the querystring, `+` is first turned into a space, then the whole query is
      url-decoded; a missing query adds nothing, not even a trailing `?`

    Runs of consecutive slashes are then collapsed in the path and in the query, and
    fuzzy rules are finally applied on the result.

    Raises an Exception when the URL has no hostname.
    """
    split = urlsplit(url.value)

    raw_host = split.hostname
    if not raw_host:
        raise Exception("Hostname is missing")

    # NOTE(review): only the beginning of the hostname is inspected, so a hostname
    # whose first label is not punycode (e.g. `www.xn--...`) is kept encoded -
    # confirm this is intended
    hostname = idna.decode(raw_host) if raw_host.startswith("xn--") else raw_host

    # path is stored unencoded in the ZIM as required by the ZIM specification
    path = unquote(split.path) if split.path else "/"

    if split.query:
        # `+` must become a space before url-decoding, otherwise it could not be
        # told apart from a real plus sign (`%2B`) once everything is decoded
        query = "?" + unquote(split.query.replace("+", " "))
    else:
        query = ""

    candidate = (
        f"{hostname}{_remove_subsequent_slashes(path)}{_remove_subsequent_slashes(query)}"
    )
    return ZimPath(apply_fuzzy_rules(candidate))
def _remove_subsequent_slashes(value: str) -> str:
    """Collapse every run of successive slashes `/` of a string into a single slash

    E.g `val//ue` or `val///ue` or `val////ue` (and so on) are transformed into
    `val/ue`. A lone slash is left untouched.
    """
    return re.sub(r"//+", "/", value)
def get_without_fragment(url: str) -> str:
    """Return `url` rebuilt from its parsed components, with an empty fragment"""
    scheme, netloc, path, query, _fragment = urlsplit(url)
    return urlunsplit((scheme, netloc, path, query, ""))
class ArticleUrlRewriter:
    """Rewrite the URLs found in one article so that they target ZIM entries

    An instance is bound to a single article: the article URL is used to resolve
    relative links and the article ZIM path is used to compute relative URIs towards
    other ZIM entries.
    """

    def __init__(
        self,
        article_url: HttpUrl,
        existing_zim_paths: set[ZimPath],
        missing_zim_paths: set[ZimPath] | None = None,
    ):
        """
        Arguments:
          article_url: URL of the article whose links are rewritten
          existing_zim_paths: ZIM paths known to exist, consulted when the rewriter
            is called with `rewrite_all_url=False`
          missing_zim_paths: optional set where paths not found in
            `existing_zim_paths` are accumulated so that each of them is logged only
            once; nothing is logged nor tracked when None
        """
        self.article_path = normalize(article_url)
        self.article_url = article_url
        self.existing_zim_paths = existing_zim_paths
        self.missing_zim_paths = missing_zim_paths

    def get_item_path(self, item_url: str, base_href: str | None) -> ZimPath:
        """Utility to transform an item URL into a ZimPath

        The item URL is resolved against the article URL (itself combined with
        `base_href` when one is given) and the absolute URL is then normalized.
        """
        item_absolute_url = urljoin(
            urljoin(self.article_url.value, base_href), item_url
        )
        return normalize(HttpUrl(item_absolute_url))

    def __call__(
        self,
        item_url: str,
        base_href: str | None,
        *,
        rewrite_all_url: bool = True,
    ) -> str:
        """Rewrite a url contained in an article.

        The url is "fully" rewritten to point to a normalized entry path.

        Returned value is:
        - `item_url` stripped and otherwise untouched for standalone fragments
          (`#...`) and for schemes other than http / https
        - a relative, url-encoded URI to the ZIM entry when `rewrite_all_url` is set
          or when the entry is part of `existing_zim_paths`
        - the absolute URL of the item otherwise
        - `item_url` as-is when anything fails along the way (failure is logged at
          debug level, never raised)
        """
        # Placeholders reported in the debug log below for the steps which did not
        # run before a failure occurred
        item_scheme: str = "<not_set>"
        item_absolute_url: str = "<not_set>"
        item_fragment: str = "<not_set>"
        item_path: ZimPath | str = "<not_set>"
        try:
            item_url = item_url.strip()

            # Make case of standalone fragments more straightforward
            if item_url.startswith("#"):
                return item_url

            item_scheme = urlsplit(item_url).scheme
            if item_scheme and item_scheme not in ("http", "https"):
                return item_url

            item_absolute_url = urljoin(
                urljoin(self.article_url.value, base_href), item_url
            )
            item_fragment = urlsplit(item_absolute_url).fragment
            item_path = normalize(HttpUrl(item_absolute_url))

            if rewrite_all_url or item_path in self.existing_zim_paths:
                return self.get_document_uri(item_path, item_fragment)
            else:
                if (
                    self.missing_zim_paths is not None
                    and item_path not in self.missing_zim_paths
                ):
                    logger.debug(f"WARNING {item_path} ({item_url}) not in archive.")
                    # maintain a collection of missing Zim Path to not fill the logs
                    # with duplicate messages
                    self.missing_zim_paths.add(item_path)
                # The url doesn't point to a known entry
                return item_absolute_url

        except Exception as exc:
            logger.debug(
                f"Invalid URL value found in {self.article_url.value}, kept as-is. "
                f"(item_url: {item_url}, "
                f"item_scheme: {item_scheme}, "
                f"item_absolute_url: {item_absolute_url}, "
                f"item_fragment: {item_fragment}, "
                f"item_path: {item_path}, "
                f"rewrite_all_url: {rewrite_all_url})",
                exc_info=exc,
            )
            return item_url

    def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str:
        """Given a ZIM item path and its fragment, get the URI to use in document

        This function transforms the path of a ZIM item we want to address from
        current document (HTML / JS / ...) and returns the corresponding URI to use.

        It computes the relative path based on current document location and escapes
        everything which needs to be to transform the ZIM path into a valid RFC 3986
        URI.

        It also appends a potential trailing item fragment at the end of the
        resulting URI.
        """
        item_parts = urlsplit(item_path.value)

        # item_path is both path + querystring, both will be url-encoded in the
        # document so that readers consider them as a whole and properly pass them
        # to libzim
        item_url = item_parts.path
        if item_parts.query:
            item_url += "?" + item_parts.query

        # relative path is computed from the "folder" of the article: the article
        # path itself when it ends with a '/', its parent otherwise
        relative_path = str(
            PurePosixPath(item_url).relative_to(
                (
                    PurePosixPath(self.article_path.value)
                    if self.article_path.value.endswith("/")
                    else PurePosixPath(self.article_path.value).parent
                ),
                walk_up=True,
            )
        )
        # relative_to removes a potential last '/' in the path, we add it back
        if item_path.value.endswith("/"):
            relative_path += "/"

        # only '/' separators are left unencoded
        return (
            f"{quote(relative_path, safe='/')}"
            f"{'#' + item_fragment if item_fragment else ''}"
        )

View file

@ -1,88 +0,0 @@
from collections.abc import Callable
import pytest
from warc2zim.content_rewriting.css import CssRewriter
from warc2zim.content_rewriting.js import JsRewriter
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
@pytest.fixture(scope="module")
def no_js_notify():
    """Fixture giving a JS module notification callback which ignores its argument"""

    def ignore_js_module(_: str):
        return None

    return ignore_js_module
class SimpleUrlRewriter(ArticleUrlRewriter):
    """Stub URL rewriter for tests

    The parent constructor is not called: only the article URL and a suffix are
    stored. Rewriting an URL merely appends the suffix to it, and the other methods
    return empty values.
    """

    def __init__(self, article_url: HttpUrl, suffix: str = ""):
        self.suffix = suffix
        self.article_url = article_url

    def __call__(
        self,
        item_url: str,
        base_href: str | None,  # noqa: ARG002
        *,
        rewrite_all_url: bool = True,  # noqa: ARG002
    ) -> str:
        """Return the item URL followed by the configured suffix"""
        return item_url + self.suffix

    def get_item_path(
        self, item_url: str, base_href: str | None  # noqa: ARG002
    ) -> ZimPath:
        """Always return an empty ZIM path"""
        return ZimPath("")

    def get_document_uri(
        self, item_path: ZimPath, item_fragment: str  # noqa: ARG002
    ) -> str:
        """Always return an empty URI"""
        return ""
@pytest.fixture(scope="module")
def simple_url_rewriter():
    """Fixture giving a factory of SimpleUrlRewriter (URLs as-is, plus a suffix)"""

    def build_simple_url_rewriter(url: str, suffix: str = ""):
        article_url = HttpUrl(url)
        return SimpleUrlRewriter(article_url, suffix=suffix)

    return build_simple_url_rewriter
@pytest.fixture(scope="module")
def js_rewriter():
    """Fixture to create a JsRewriter from a URL rewriter, a base href and a callback

    The value of the fixture is a factory: the test calls it with the URL rewriter,
    the base href and the JS module notification callback to pass to the JsRewriter.
    """

    def get_js_rewriter(
        url_rewriter: ArticleUrlRewriter,
        base_href: str | None,
        notify_js_module: Callable[[ZimPath], None],
    ):
        return JsRewriter(
            url_rewriter=url_rewriter,
            base_href=base_href,
            notify_js_module=notify_js_module,
        )

    yield get_js_rewriter
@pytest.fixture(scope="module")
def css_rewriter():
    """Fixture to create a CssRewriter from a URL rewriter and a base href

    The value of the fixture is a factory: the test calls it with the URL rewriter
    and the base href to pass to the CssRewriter.
    """

    def get_css_rewriter(
        url_rewriter: ArticleUrlRewriter,
        base_href: str | None,
    ):
        return CssRewriter(
            url_rewriter=url_rewriter,
            base_href=base_href,
        )

    yield get_css_rewriter

View file

@ -1,158 +0,0 @@
from textwrap import dedent
import pytest
from warc2zim.content_rewriting.css import CssRewriter
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl
from .utils import ContentForTests
# Well-formed stylesheets: the first ones must come out untouched, the last three
# only have their `url(...)` rewritten
_WELL_FORMED_CSS_CASES = [
    ContentForTests(b"p { color: red; }"),
    ContentForTests(b"p {\n color: red;\n}"),
    ContentForTests(b"p { background: blue; }"),
    ContentForTests(b"p { background: rgb(15, 0, 52); }"),
    ContentForTests(
        b"/* See bug issue at http://exemple.com/issue/link */ p { color: blue; }"
    ),
    ContentForTests(
        b"p { width= } div { background: url(http://exemple.com/img.png)}",
        b"p { width= } div { background: url(../exemple.com/img.png)}",
    ),
    ContentForTests(
        b"p { width= } div { background: url('http://exemple.com/img.png')}",
        b'p { width= } div { background: url("../exemple.com/img.png")}',
    ),
    ContentForTests(
        b'p { width= } div { background: url("http://exemple.com/img.png")}',
        b'p { width= } div { background: url("../exemple.com/img.png")}',
    ),
]


@pytest.fixture(params=_WELL_FORMED_CSS_CASES)
def no_rewrite_content(request):
    """Give each well-formed stylesheet case in turn"""
    return request.param
def test_no_rewrite(no_rewrite_content):
    """Rewriting a whole stylesheet gives the expected (decoded) content"""
    url_rewriter = ArticleUrlRewriter(
        HttpUrl(f"http://{no_rewrite_content.article_url}"), set()
    )
    rewritten = CssRewriter(url_rewriter, base_href=None).rewrite(
        no_rewrite_content.input_bytes
    )
    assert rewritten == no_rewrite_content.expected_bytes.decode()
# Invalid inline styles (content of a `style` attribute)
_INVALID_INLINE_CSS_CASES = [
    ContentForTests('"border:'),
    ContentForTests("border: solid 1px #c0c0c0; width= 100%"),
    # Despite being invalid, tinycss parse it as "width" property without value.
    ContentForTests("width:", "width:;"),
    ContentForTests("border-bottom-width: 1px;border-bottom-color: #c0c0c0;w"),
    ContentForTests(
        'background: url("http://exemple.com/foo.png"); width=',
        'background: url("../exemple.com/foo.png"); width=',
    ),
]


@pytest.fixture(params=_INVALID_INLINE_CSS_CASES)
def invalid_content_inline(request):
    """Give each invalid inline style case in turn"""
    return request.param
def test_invalid_css_inline(invalid_content_inline):
    """Rewriting an invalid inline style gives the expected content"""
    url_rewriter = ArticleUrlRewriter(
        HttpUrl(f"http://{invalid_content_inline.article_url}"), set()
    )
    rewritten = CssRewriter(url_rewriter, base_href=None).rewrite_inline(
        invalid_content_inline.input_str
    )
    assert rewritten == invalid_content_inline.expected_str
# Invalid whole stylesheets
_INVALID_CSS_CASES = [
    # Tinycss parse `"border:}` as a string with an unexpected eof in string.
    # At serialization, tiny try to recover and close the opened rule
    ContentForTests(b'p {"border:}', b'p {"border:}}'),
    ContentForTests(b'"p {border:}'),
    ContentForTests(b"p { border: solid 1px #c0c0c0; width= 100% }"),
    ContentForTests(b"p { width: }"),
    ContentForTests(
        b"p { border-bottom-width: 1px;border-bottom-color: #c0c0c0;w }"
    ),
    ContentForTests(
        b'p { background: url("http://exemple.com/foo.png"); width= }',
        b'p { background: url("../exemple.com/foo.png"); width= }',
    ),
]


@pytest.fixture(params=_INVALID_CSS_CASES)
def invalid_content(request):
    """Give each invalid stylesheet case in turn"""
    return request.param
def test_invalid_cssl(invalid_content):
    """Rewriting an invalid stylesheet gives the expected (decoded) content"""
    url_rewriter = ArticleUrlRewriter(
        HttpUrl(f"http://{invalid_content.article_url}"), set()
    )
    rewritten = CssRewriter(url_rewriter, base_href=None).rewrite(
        invalid_content.input_bytes
    )
    assert rewritten == invalid_content.expected_bytes.decode()
def test_rewrite():
    """URLs of a complete stylesheet are rewritten, comments are left untouched"""
    stylesheet = b"""
/* A comment with a link : http://foo.com */
@import url(//fonts.googleapis.com/icon?family=Material+Icons);
p, input {
color: rbg(1, 2, 3);
background: url('http://kiwix.org/super/img');
background-image:url('http://exemple.com/no_space_before_url');
}
@font-face {
src: url(https://f.gst.com/s/qa/v31/6xKtdSZaE8KbpRA_hJFQNcOM.woff2) format('woff2');
}
@media only screen and (max-width: 40em) {
p, input {
background-image:url();
}
}"""

    expected = dedent(
        """
/* A comment with a link : http://foo.com */
@import url(../fonts.googleapis.com/icon%3Ffamily%3DMaterial%20Icons);
p, input {
color: rbg(1, 2, 3);
background: url("super/img");
background-image:url("../exemple.com/no_space_before_url");
}
@font-face {
src: url(../f.gst.com/s/qa/v31/6xKtdSZaE8KbpRA_hJFQNcOM.woff2) format("woff2");
}
@media only screen and (max-width: 40em) {
p, input {
background-image:url();
}
}"""
    )

    url_rewriter = ArticleUrlRewriter(HttpUrl("http://kiwix.org/article"), set())
    rewritten = CssRewriter(url_rewriter, base_href=None).rewrite(stylesheet)
    assert rewritten == expected

File diff suppressed because it is too large Load diff

View file

@ -1,319 +0,0 @@
import pytest
from warc2zim.content_rewriting.js import JsRewriter
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
from .utils import ContentForTests
@pytest.fixture
def simple_js_rewriter(simple_url_rewriter, no_js_notify) -> JsRewriter:
    """JsRewriter for http://www.example.com, built from the two fixtures received"""
    url_rewriter = simple_url_rewriter("http://www.example.com")
    return JsRewriter(
        url_rewriter=url_rewriter,
        base_href=None,
        notify_js_module=no_js_notify,
    )
# JS snippets where every occurrence of `this` is expected to be wrapped
_THIS_JS_SNIPPETS = [
    "a = this;",
    "return this.location",
    'func(Function("return this"));',
    "'a||this||that",
    "(a,b,Q.contains(i[t], this))",
    "a = this.location.href; exports.Foo = Foo; /* export className */",
]


@pytest.fixture(params=_THIS_JS_SNIPPETS)
def rewrite_this_js_content(request):
    """Give each snippet together with its version where `this` is wrapped"""
    snippet = request.param
    wrapped = snippet.replace("this", "_____WB$wombat$check$this$function_____(this)")
    return ContentForTests(snippet, wrapped)
def test_this_js_rewrite(simple_js_rewriter: JsRewriter, rewrite_this_js_content):
    """Rewriting the snippet gives its expected wrapped version"""
    rewritten = simple_js_rewriter.rewrite(rewrite_this_js_content.input_str)
    assert rewritten == rewrite_this_js_content.expected_str
class WrappedTestContent(ContentForTests):
    """Test content whose expected value is enclosed in the local definitions block"""

    @staticmethod
    def wrap_script(text: str) -> str:
        """
        Enclose `text` in the declarations expected around rewritten JS code.

        JsRewriter must add this local definition around all js code (when we access
        one of the local variables)
        """
        prologue = (
            "var _____WB$wombat$assign$function_____ = function(name) {return (self."
            "_wb_wombat && self._wb_wombat.local_init && self._wb_wombat.local_init"
            "(name)) || self[name]; };\n"
            "if (!self.__WB_pmw) { self.__WB_pmw = function(obj) { this.__WB_source ="
            " obj; return this; } }\n"
            "{\n"
            'let window = _____WB$wombat$assign$function_____("window");\n'
            'let globalThis = _____WB$wombat$assign$function_____("globalThis");\n'
            'let self = _____WB$wombat$assign$function_____("self");\n'
            'let document = _____WB$wombat$assign$function_____("document");\n'
            'let location = _____WB$wombat$assign$function_____("location");\n'
            'let top = _____WB$wombat$assign$function_____("top");\n'
            'let parent = _____WB$wombat$assign$function_____("parent");\n'
            'let frames = _____WB$wombat$assign$function_____("frames");\n'
            'let opener = _____WB$wombat$assign$function_____("opener");\n'
            "let arguments;\n"
            "\n"
        )
        epilogue = "\n" "}"
        return prologue + f"{text}" + epilogue

    def __post_init__(self):
        # let the parent settle its fields first, then wrap the expected content
        super().__post_init__()
        self.expected = self.wrap_script(self.expected_str)
# JS snippets whose rewritten version is expected inside the local definitions block
_WRAPPED_JS_CASES = [
    WrappedTestContent(
        "location = http://example.com/",
        "location = ((self.__WB_check_loc && self.__WB_check_loc(location, argument"
        "s)) || {}).href = http://example.com/",
    ),
    WrappedTestContent(
        " location = http://example.com/2",
        " location = ((self.__WB_check_loc && self.__WB_check_loc(location, argumen"
        "ts)) || {}).href = http://example.com/2",
    ),
    WrappedTestContent("func(location = 0)", "func(location = 0)"),
    WrappedTestContent(
        " location = http://example.com/2",
        " location = ((self.__WB_check_loc && self.__WB_check_loc(location, argumen"
        "ts)) || {}).href = http://example.com/2",
    ),
    WrappedTestContent("window.eval(a)", "window.eval(a)"),
    WrappedTestContent("x = window.eval; x(a);", "x = window.eval; x(a);"),
    WrappedTestContent(
        "this. location = 'http://example.com/'",
        "this. location = 'http://example.com/'",
    ),
    WrappedTestContent(
        "if (self.foo) { console.log('blah') }",
        "if (self.foo) { console.log('blah') }",
    ),
    WrappedTestContent("window.x = 5", "window.x = 5"),
]


@pytest.fixture(params=_WRAPPED_JS_CASES)
def rewrite_wrapped_content(request):
    """Give each wrapped JS case in turn"""
    return request.param
def test_wrapped_rewrite(simple_js_rewriter: JsRewriter, rewrite_wrapped_content):
    """Rewriting the snippet gives its expected version, wrapped in local definitions"""
    rewritten = simple_js_rewriter.rewrite(rewrite_wrapped_content.input_str)
    assert rewritten == rewrite_wrapped_content.expected_str
class ImportTestContent(ContentForTests):
    """Test content for JS modules: expected value starts with the wombat import"""

    @staticmethod
    # We want to import js stored in zim file as `_zim_static/__wb_module_decl.js` from
    # `https://exemple.com/some/path/` so path is
    # `../../../_zim_static/__wb_module_decl.js`
    def wrap_import(text: str) -> str:
        """
        Prepend to `text` the import line expected at the top of rewritten modules.

        JsRewriter must add this import line at beginning of module codes (when code
        contains `import` or `export`)
        """
        import_line = (
            "import { window, globalThis, self, document, location, top, parent, "
            'frames, opener } from "../../../_zim_static/__wb_module_decl.js";\n'
        )
        return import_line + f"{text}"

    def __post_init__(self):
        # let the parent settle its fields first, then force the article URL the
        # relative import path above is computed from and wrap the expected content
        super().__post_init__()
        self.article_url = "https://exemple.com/some/path/"
        self.expected = self.wrap_import(self.expected_str)
# JS module snippets and their expected rewritten version (without the leading
# import line, which is added by ImportTestContent)
_IMPORT_JS_CASES = [
    # import rewrite
    ImportTestContent(
        """import "foo";
a = this.location""",
        """import "foo";
a = _____WB$wombat$check$this$function_____(this).location""",
    ),
    # import/export module rewrite
    ImportTestContent(
        """a = this.location
export { a };
""",
        """a = _____WB$wombat$check$this$function_____(this).location
export { a };
""",
    ),
    # rewrite ESM module import
    ImportTestContent(
        'import "https://example.com/file.js"',
        'import "../../../example.com/file.js"',
    ),
    ImportTestContent(
        '''
import {A, B}
from
"https://example.com/file.js"''',
        '''
import {A, B}
from
"../../../example.com/file.js"''',
    ),
    ImportTestContent(
        """
import * from "https://example.com/file.js"
import A from "http://example.com/path/file2.js";
import {C, D} from "./abc.js";
import {X, Y} from "../parent.js";
import {E, F, G} from "/path.js";
import { Z } from "../../../path.js";
B = await import(somefile);
""",
        """
import * from "../../../example.com/file.js"
import A from "../../../example.com/path/file2.js";
import {C, D} from "./abc.js";
import {X, Y} from "../parent.js";
import {E, F, G} from "../../path.js";
import { Z } from "../../path.js";
B = await ____wb_rewrite_import__(import.meta.url, somefile);
""",
    ),
    ImportTestContent(
        'import"import.js";import{A, B, C} from"test.js";(function() => { frames[0]'
        '.href = "/abc"; })',
        'import"import.js";import{A, B, C} from"test.js";(function() => { frames[0]'
        '.href = "/abc"; })',
    ),
    ImportTestContent(
        """a = location
export{ a, $ as b};
""",
        """a = location
export{ a, $ as b};
""",
    ),
]


@pytest.fixture(params=_IMPORT_JS_CASES)
def rewrite_import_content(request):
    """Give each JS module case in turn"""
    return request.param
def test_import_rewrite(no_js_notify, rewrite_import_content):
    """Rewriting the snippet as a JS module gives its expected version"""
    article_url = HttpUrl(rewrite_import_content.article_url)
    js_rewriter = JsRewriter(
        url_rewriter=ArticleUrlRewriter(article_url, set()),
        base_href=None,
        notify_js_module=no_js_notify,
    )
    rewritten = js_rewriter.rewrite(
        rewrite_import_content.input_str, opts={"isModule": True}
    )
    assert rewritten == rewrite_import_content.expected_str
# JS snippets which must come out of the rewriter untouched
_UNTOUCHED_JS_SNIPPETS = [
    "return this.abc",
    "return this object",
    "a = 'some, this object'",
    "{foo: bar, this: other}",
    "this.$location = http://example.com/",
    "this. $location = http://example.com/",
    "this. _location = http://example.com/",
    "this. alocation = http://example.com/",
    "this.location = http://example.com/",
    ",eval(a)",
    "this.$eval(a)",
    "x = $eval; x(a);",
    "obj = { eval : 1 }",
    "x = obj.eval",
    "x = obj.eval(a)",
    "x = obj._eval(a)",
    "x = obj.$eval(a)",
    "if (a.self.foo) { console.log('blah') }",
    "a.window.x = 5",
    " postMessage({'a': 'b'})",
    "simport(5);",
    "a.import(5);",
    "$import(5);",
    "async import(val) { ... }",
    """function blah() {
const text = "text: import a from B.js";
}
""",
    """function blah() {
const text = `
import a from "https://example.com/B.js"
`;
}
""",
    "let a = 7; var b = 5; const foo = 4;\n\n",
]


@pytest.fixture(params=_UNTOUCHED_JS_SNIPPETS)
def no_rewrite_js_content(request):
    """Give each untouched JS snippet in turn"""
    return request.param
def test_no_rewrite(simple_js_rewriter: JsRewriter, no_rewrite_js_content):
    """Rewriting the snippet returns it unchanged"""
    rewritten = simple_js_rewriter.rewrite(no_rewrite_js_content)
    assert rewritten == no_rewrite_js_content
@pytest.mark.parametrize(
    "js_src,expected_js_module_path",
    [
        ("./my-module-script.js", "kiwix.org/my_folder/my-module-script.js"),
        ("../my-module-script.js", "kiwix.org/my-module-script.js"),
        ("../../../my-module-script.js", "kiwix.org/my-module-script.js"),
        ("/my-module-script.js", "kiwix.org/my-module-script.js"),
        ("//myserver.com/my-module-script.js", "myserver.com/my-module-script.js"),
        (
            "https://myserver.com/my-module-script.js",
            "myserver.com/my-module-script.js",
        ),
    ],
)
def test_js_rewrite_nested_module_detected(js_src, expected_js_module_path):
    """A module imported by a JS module is notified once, with its ZIM path"""
    notified_paths = []

    def record_module(zim_path: ZimPath):
        notified_paths.append(zim_path)

    article_url = HttpUrl("http://kiwix.org/my_folder/my_article.html")
    js_rewriter = JsRewriter(
        url_rewriter=ArticleUrlRewriter(article_url, set()),
        base_href=None,
        notify_js_module=record_module,
    )
    js_rewriter.rewrite(f'import * from "{js_src}"', opts={"isModule": True})

    assert len(notified_paths) == 1
    assert notified_paths[0].value == expected_js_module_path

View file

@ -15,7 +15,7 @@ def test_title_validation(title, is_valid):
if is_valid:
assert main(args) == 100
else:
with pytest.raises(ValueError, match="Title is too long"):
with pytest.raises(ValueError, match="Title value is too long"):
main(args)
@ -35,7 +35,7 @@ def test_description_validation(description, is_valid):
if is_valid:
assert main(args) == 100
else:
with pytest.raises(ValueError, match="Description is too long"):
with pytest.raises(ValueError, match="Description value is too long"):
main(args)
@ -62,7 +62,7 @@ def test_long_description_validation(long_description, is_valid):
if is_valid:
assert main(args) == 100
else:
with pytest.raises(ValueError, match="Description is too long"):
with pytest.raises(ValueError, match="LongDescription value is too long"):
main(args)

View file

@ -4,9 +4,9 @@ import pytest
from jinja2 import Template
from warcio import StatusAndHeaders
from warcio.recordloader import ArcWarcRecord
from zimscraperlib.rewriting.url_rewriting import ZimPath
from warc2zim.content_rewriting.generic import Rewriter
from warc2zim.url_rewriting import ZimPath
from warc2zim.rewriting import Rewriter
@pytest.fixture(scope="module")

View file

@ -1,538 +0,0 @@
import pytest
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
@pytest.mark.parametrize(
"article_url, original_content_url, expected_rewriten_content_url, know_paths, "
"rewrite_all_url",
[
(
"https://kiwix.org/a/article/document.html",
"foo.html",
"foo.html",
["kiwix.org/a/article/foo.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"foo.html#anchor1",
"foo.html#anchor1",
["kiwix.org/a/article/foo.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"foo.html?foo=bar",
"foo.html%3Ffoo%3Dbar",
["kiwix.org/a/article/foo.html?foo=bar"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"foo.html?foo=b%24ar",
"foo.html%3Ffoo%3Db%24ar",
["kiwix.org/a/article/foo.html?foo=b$ar"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"foo.html?foo=b%3Far", # a query string with an encoded ? char in value
"foo.html%3Ffoo%3Db%3Far",
["kiwix.org/a/article/foo.html?foo=b?ar"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"fo%o.html",
"fo%25o.html",
["kiwix.org/a/article/fo%o.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"foé.html", # URL not matching RFC 3986 (many HTML documents are invalid)
"fo%C3%A9.html", # character is encoded so that URL match RFC 3986
["kiwix.org/a/article/foé.html"], # but ZIM path is non-encoded
False,
),
(
"https://kiwix.org/a/article/document.html",
"./foo.html",
"foo.html",
["kiwix.org/a/article/foo.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"../foo.html",
"https://kiwix.org/a/foo.html", # Full URL since not in known URLs
["kiwix.org/a/article/foo.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"../foo.html",
"../foo.html", # all URLs rewrite activated
["kiwix.org/a/article/foo.html"],
True,
),
(
"https://kiwix.org/a/article/document.html",
"../foo.html",
"../foo.html",
["kiwix.org/a/foo.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"../bar/foo.html",
"https://kiwix.org/a/bar/foo.html", # Full URL since not in known URLs
["kiwix.org/a/article/foo.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"../bar/foo.html",
"../bar/foo.html", # all URLs rewrite activated
["kiwix.org/a/article/foo.html"],
True,
),
(
"https://kiwix.org/a/article/document.html",
"../bar/foo.html",
"../bar/foo.html",
["kiwix.org/a/bar/foo.html"],
False,
),
( # we cannot go upper than host, so '../' in excess are removed
"https://kiwix.org/a/article/document.html",
"../../../../../foo.html",
"../../foo.html",
["kiwix.org/foo.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"foo?param=value",
"foo%3Fparam%3Dvalue",
["kiwix.org/a/article/foo?param=value"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"foo?param=value%2F",
"foo%3Fparam%3Dvalue/",
["kiwix.org/a/article/foo?param=value/"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"foo?param=value%2Fend",
"foo%3Fparam%3Dvalue/end",
["kiwix.org/a/article/foo?param=value/end"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"foo/",
"foo/",
["kiwix.org/a/article/foo/"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"/fo o.html",
"../../fo%20o.html",
["kiwix.org/fo o.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"/fo+o.html",
"../../fo%2Bo.html",
["kiwix.org/fo+o.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"/fo%2Bo.html",
"../../fo%2Bo.html",
["kiwix.org/fo+o.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"/foo.html?param=val+ue",
"../../foo.html%3Fparam%3Dval%20ue",
["kiwix.org/foo.html?param=val ue"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"/fo~o.html",
"../../fo~o.html",
["kiwix.org/fo~o.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"/fo-o.html",
"../../fo-o.html",
["kiwix.org/fo-o.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"/fo_o.html",
"../../fo_o.html",
["kiwix.org/fo_o.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"/fo%7Eo.html", # must not be encoded / must be decoded (RFC 3986 #2.3)
"../../fo~o.html",
["kiwix.org/fo~o.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"/fo%2Do.html", # must not be encoded / must be decoded (RFC 3986 #2.3)
"../../fo-o.html",
["kiwix.org/fo-o.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"/fo%5Fo.html", # must not be encoded / must be decoded (RFC 3986 #2.3)
"../../fo_o.html",
["kiwix.org/fo_o.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"/foo%2Ehtml", # must not be encoded / must be decoded (RFC 3986 #2.3)
"../../foo.html",
["kiwix.org/foo.html"],
False,
),
(
"https://kiwix.org/a/article/document.html",
"#anchor1",
"#anchor1",
["kiwix.org/a/article/document.html"],
False,
),
(
"https://kiwix.org/a/article/",
"#anchor1",
"#anchor1",
["kiwix.org/a/article/"],
False,
),
(
"https://kiwix.org/a/article/",
"../article/",
"./",
["kiwix.org/a/article/"],
False,
),
],
)
def test_relative_url(
    article_url,
    know_paths,
    original_content_url,
    expected_rewriten_content_url,
    rewrite_all_url,
):
    """Rewrite one URL found in an article and compare it with the expectation.

    article_url: URL of the document containing the link (wrapped in HttpUrl).
    know_paths: ZIM paths handed to the rewriter as the set of known entries.
    original_content_url: link value as it appears in the document.
    expected_rewriten_content_url: value the rewriter is expected to return.
    rewrite_all_url: forwarded as-is to the rewriter call.
    """
    known_zim_paths = {ZimPath(known) for known in know_paths}
    rewrite = ArticleUrlRewriter(HttpUrl(article_url), known_zim_paths)
    rewritten = rewrite(
        original_content_url,
        base_href=None,
        rewrite_all_url=rewrite_all_url,
    )
    assert rewritten == expected_rewriten_content_url
@pytest.mark.parametrize(
    "article_url, original_content_url, expected_rewriten_content_url, know_paths, "
    "rewrite_all_url",
    [
        # target is a known ZIM path: rewritten relative to the article
        (
            "https://kiwix.org/a/article/document.html",
            "/foo.html",
            "../../foo.html",
            ["kiwix.org/foo.html"],
            False,
        ),
        # target is not a known ZIM path: the full URL is expected
        (
            "https://kiwix.org/a/article/document.html",
            "/bar.html",
            "https://kiwix.org/bar.html",
            ["kiwix.org/foo.html"],
            False,
        ),
        # target is not known but rewrite_all_url is on: relative path expected
        (
            "https://kiwix.org/a/article/document.html",
            "/bar.html",
            "../../bar.html",
            ["kiwix.org/foo.html"],
            True,
        ),
    ],
)
def test_absolute_path_url(
    article_url,
    know_paths,
    original_content_url,
    expected_rewriten_content_url,
    rewrite_all_url,
):
    """Check rewriting of links given as an absolute path (starting with `/`).

    Each case builds a rewriter for `article_url` with `know_paths` as known
    ZIM paths, rewrites `original_content_url` without any base href and
    compares the result with `expected_rewriten_content_url`.
    """
    known_zim_paths = {ZimPath(known) for known in know_paths}
    rewrite = ArticleUrlRewriter(HttpUrl(article_url), known_zim_paths)
    rewritten = rewrite(
        original_content_url,
        base_href=None,
        rewrite_all_url=rewrite_all_url,
    )
    assert rewritten == expected_rewriten_content_url
@pytest.mark.parametrize(
    "article_url, original_content_url, expected_rewriten_content_url, know_paths, "
    "rewrite_all_url",
    [
        # same host, known ZIM path: rewritten relative to the article
        (
            "https://kiwix.org/a/article/document.html",
            "//kiwix.org/foo.html",
            "../../foo.html",
            ["kiwix.org/foo.html"],
            False,
        ),
        # same host, path not known: the full URL is expected
        (
            "https://kiwix.org/a/article/document.html",
            "//kiwix.org/bar.html",
            "https://kiwix.org/bar.html",
            ["kiwix.org/foo.html"],
            False,
        ),
        # same host, path not known, rewrite_all_url on: relative path expected
        (
            "https://kiwix.org/a/article/document.html",
            "//kiwix.org/bar.html",
            "../../bar.html",
            ["kiwix.org/foo.html"],
            True,
        ),
        # other host, known ZIM path: relative path going through the other host
        (
            "https://kiwix.org/a/article/document.html",
            "//acme.com/foo.html",
            "../../../acme.com/foo.html",
            ["acme.com/foo.html"],
            False,
        ),
        # other host, path not known: full URL expected (article here is http)
        (
            "http://kiwix.org/a/article/document.html",
            "//acme.com/bar.html",
            "http://acme.com/bar.html",
            ["kiwix.org/foo.html"],
            False,
        ),
        # other host, path not known, rewrite_all_url on: relative path expected
        (
            "https://kiwix.org/a/article/document.html",
            "//acme.com/bar.html",
            "../../../acme.com/bar.html",
            ["kiwix.org/foo.html"],
            True,
        ),
        # puny-encoded host is transformed into url-encoded value
        (
            "https://kiwix.org/a/article/document.html",
            "//xn--exmple-cva.com/a/article/document.html",
            "../../../ex%C3%A9mple.com/a/article/document.html",
            ["exémple.com/a/article/document.html"],
            False,
        ),
        # host which should be puny-encoded is transformed into url-encoded value
        (
            "https://kiwix.org/a/article/document.html",
            "//exémple.com/a/article/document.html",
            "../../../ex%C3%A9mple.com/a/article/document.html",
            ["exémple.com/a/article/document.html"],
            False,
        ),
    ],
)
def test_absolute_scheme_url(
    article_url,
    know_paths,
    original_content_url,
    expected_rewriten_content_url,
    rewrite_all_url,
):
    """Check rewriting of scheme-relative links (`//host/path`).

    Each case builds a rewriter for `article_url` with `know_paths` as known
    ZIM paths, rewrites `original_content_url` without any base href and
    compares the result with `expected_rewriten_content_url`.
    """
    known_zim_paths = {ZimPath(known) for known in know_paths}
    rewrite = ArticleUrlRewriter(HttpUrl(article_url), known_zim_paths)
    rewritten = rewrite(
        original_content_url,
        base_href=None,
        rewrite_all_url=rewrite_all_url,
    )
    assert rewritten == expected_rewriten_content_url
@pytest.mark.parametrize(
    "article_url, original_content_url, expected_rewriten_content_url, know_paths, "
    "rewrite_all_url",
    [
        # https article linking to a known https target
        (
            "https://kiwix.org/a/article/document.html",
            "https://foo.org/a/article/document.html",
            "../../../foo.org/a/article/document.html",
            ["foo.org/a/article/document.html"],
            False,
        ),
        # https article linking to a known http target
        (
            "https://kiwix.org/a/article/document.html",
            "http://foo.org/a/article/document.html",
            "../../../foo.org/a/article/document.html",
            ["foo.org/a/article/document.html"],
            False,
        ),
        # http article linking to a known https target
        (
            "http://kiwix.org/a/article/document.html",
            "https://foo.org/a/article/document.html",
            "../../../foo.org/a/article/document.html",
            ["foo.org/a/article/document.html"],
            False,
        ),
        # credentials and port of the target do not appear in the expected path
        (
            "http://kiwix.org/a/article/document.html",
            "https://user:password@foo.org:8080/a/article/document.html",
            "../../../foo.org/a/article/document.html",
            ["foo.org/a/article/document.html"],
            False,
        ),
        # Full URL since not in known URLs
        (
            "https://kiwix.org/a/article/document.html",
            "https://foo.org/a/article/document.html",
            "https://foo.org/a/article/document.html",
            ["kiwix.org/a/article/foo/"],
            False,
        ),
        # all URLs rewrite activated
        (
            "https://kiwix.org/a/article/document.html",
            "https://foo.org/a/article/document.html",
            "../../../foo.org/a/article/document.html",
            ["kiwix.org/a/article/foo/"],
            True,
        ),
        # puny-encoded host is transformed into url-encoded value
        (
            "https://kiwix.org/a/article/document.html",
            "https://xn--exmple-cva.com/a/article/document.html",
            "../../../ex%C3%A9mple.com/a/article/document.html",
            ["exémple.com/a/article/document.html"],
            False,
        ),
        # host which should be puny-encoded is transformed into url-encoded value
        (
            "https://kiwix.org/a/article/document.html",
            "https://exémple.com/a/article/document.html",
            "../../../ex%C3%A9mple.com/a/article/document.html",
            ["exémple.com/a/article/document.html"],
            False,
        ),
    ],
)
def test_absolute_url(
    article_url,
    know_paths,
    original_content_url,
    expected_rewriten_content_url,
    rewrite_all_url,
):
    """Check rewriting of links given as full URLs (scheme, host and path).

    Each case builds a rewriter for `article_url` with `know_paths` as known
    ZIM paths, rewrites `original_content_url` without any base href and
    compares the result with `expected_rewriten_content_url`.
    """
    known_zim_paths = {ZimPath(known) for known in know_paths}
    rewrite = ArticleUrlRewriter(HttpUrl(article_url), known_zim_paths)
    rewritten = rewrite(
        original_content_url,
        base_href=None,
        rewrite_all_url=rewrite_all_url,
    )
    assert rewritten == expected_rewriten_content_url
@pytest.mark.parametrize(
    "original_content_url, rewrite_all_url",
    [
        # rewrite_all_url disabled
        ("data:0548datacontent", False),
        ("blob:exemple.com/url", False),
        ("mailto:bob@acme.com", False),
        ("tel:+33.1.12.12.23", False),
        # rewrite_all_url enabled
        ("data:0548datacontent", True),
        ("blob:exemple.com/url", True),
        ("mailto:bob@acme.com", True),
        ("tel:+33.1.12.12.23", True),
    ],
)
def test_no_rewrite_other_schemes(original_content_url, rewrite_all_url):
    """Other schemes are never rewritten, even when rewrite_all_url is true.

    The rewriter is built with an empty set of known paths and must return
    `original_content_url` unchanged.
    """
    rewrite = ArticleUrlRewriter(
        HttpUrl("https://kiwix.org/a/article/document.html"), set()
    )
    rewritten = rewrite(
        original_content_url,
        base_href=None,
        rewrite_all_url=rewrite_all_url,
    )
    assert rewritten == original_content_url
@pytest.mark.parametrize(
    "original_content_url, know_path, base_href, expected_rewriten_content_url",
    [
        # no base href: link is resolved against the article itself
        pytest.param(
            "foo.html",
            "kiwix.org/a/article/foo.html",
            None,
            "foo.html",
            id="no_base",
        ),
        # base href pointing at the parent folder
        pytest.param(
            "foo.html",
            "kiwix.org/a/foo.html",
            "../",
            "../foo.html",
            id="parent_base",
        ),
        # base href pointing at a sibling folder
        pytest.param(
            "foo.html",
            "kiwix.org/a/bar/foo.html",
            "../bar/",
            "../bar/foo.html",
            id="base_in_another_folder",
        ),
        # base href given as a full URL on another host
        pytest.param(
            "foo.html",
            "www.example.com/foo.html",
            "https://www.example.com/",
            "../../../www.example.com/foo.html",
            id="base_on_absolute_url",
        ),
    ],
)
def test_base_href(
    original_content_url,
    know_path,
    base_href,
    expected_rewriten_content_url,
):
    """Check that `base_href` is taken into account when rewriting a link.

    The article is always https://kiwix.org/a/article/document.html and
    `know_path` is the single known ZIM path; `rewrite_all_url` is False.
    """
    article = HttpUrl("https://kiwix.org/a/article/document.html")
    rewrite = ArticleUrlRewriter(article, {ZimPath(know_path)})
    rewritten = rewrite(
        original_content_url,
        base_href=base_href,
        rewrite_all_url=False,
    )
    assert rewritten == expected_rewriten_content_url

View file

@ -17,7 +17,6 @@ from zimscraperlib.zim import Archive
from warc2zim.__about__ import __version__
from warc2zim.converter import iter_warc_records
from warc2zim.main import main
from warc2zim.url_rewriting import HttpUrl, ZimPath, normalize
from warc2zim.utils import get_record_url
ZIM_ILLUSTRATION_SIZE = 48
@ -242,101 +241,6 @@ class TestWarc2Zim:
)
return dst.getvalue()
@pytest.mark.parametrize(
    "url,zim_path",
    [
        ("https://exemple.com", "exemple.com/"),
        ("https://exemple.com/", "exemple.com/"),
        ("http://example.com/resource", "example.com/resource"),
        ("http://example.com/resource/", "example.com/resource/"),
        (
            "http://example.com/resource/folder/sub.txt",
            "example.com/resource/folder/sub.txt",
        ),
        (
            "http://example.com/resource/folder/sub",
            "example.com/resource/folder/sub",
        ),
        (
            "http://example.com/resource/folder/sub?foo=bar",
            "example.com/resource/folder/sub?foo=bar",
        ),
        (
            "http://example.com/resource/folder/sub?foo=bar#anchor1",
            "example.com/resource/folder/sub?foo=bar",
        ),
        ("http://example.com/resource/#anchor1", "example.com/resource/"),
        ("http://example.com/resource/?foo=bar", "example.com/resource/?foo=bar"),
        ("http://example.com#anchor1", "example.com/"),
        ("http://example.com?foo=bar#anchor1", "example.com/?foo=bar"),
        ("http://example.com/?foo=bar", "example.com/?foo=bar"),
        ("http://example.com/?foo=ba+r", "example.com/?foo=ba r"),
        # the space was left unescaped in the source document
        (
            "http://example.com/?foo=ba r",
            "example.com/?foo=ba r",
        ),
        ("http://example.com/?foo=ba%2Br", "example.com/?foo=ba+r"),
        ("http://example.com/?foo=ba+%2B+r", "example.com/?foo=ba + r"),
        ("http://example.com/#anchor1", "example.com/"),
        (
            "http://example.com/some/path/http://example.com//some/path",
            "example.com/some/path/http:/example.com/some/path",
        ),
        (
            "http://example.com/some/pa?th/http://example.com//some/path",
            "example.com/some/pa?th/http:/example.com/some/path",
        ),
        (
            "http://example.com/so?me/pa?th/http://example.com//some/path",
            "example.com/so?me/pa?th/http:/example.com/some/path",
        ),
        ("http://example.com/resource?", "example.com/resource"),
        ("http://example.com/resource#", "example.com/resource"),
        ("http://user@example.com/resource", "example.com/resource"),
        ("http://user:password@example.com/resource", "example.com/resource"),
        ("http://example.com:8080/resource", "example.com/resource"),
        # a fuzzy rule is applied on top of the path transformations
        (
            "http://foobargooglevideo.com/videoplayback?id=1576&key=value",
            "youtube.fuzzy.replayweb.page/videoplayback?id=1576",
        ),
        ("https://xn--exmple-cva.com", "exémple.com/"),
        ("https://xn--exmple-cva.com/", "exémple.com/"),
        ("https://xn--exmple-cva.com/resource", "exémple.com/resource"),
        ("https://exémple.com/", "exémple.com/"),
        ("https://exémple.com/resource", "exémple.com/resource"),
        # host_ip is an invalid hostname according to spec
        ("https://host_ip/", "host_ip/"),
        ("https://host_ip/resource", "host_ip/resource"),
        ("https://192.168.1.1/", "192.168.1.1/"),
        ("https://192.168.1.1/resource", "192.168.1.1/resource"),
        ("http://example.com/res%24urce", "example.com/res$urce"),
        (
            "http://example.com/resource?foo=b%24r",
            "example.com/resource?foo=b$r",
        ),
        ("http://example.com/resource@300x", "example.com/resource@300x"),
        ("http://example.com:8080/resource", "example.com/resource"),
        ("http://user@example.com:8080/resource", "example.com/resource"),
        ("http://user:password@example.com:8080/resource", "example.com/resource"),
        # the two URIs below illustrate a potential collision (two different
        # URIs leading to the same ZIM path)
        (
            "http://tmp.kiwix.org/ci/test-website/images/urlencoding1_ico%CC%82ne-"
            "de%CC%81buter-Solidarite%CC%81-Nume%CC%81rique_1%40300x.png",
            "tmp.kiwix.org/ci/test-website/images/urlencoding1_icône-débuter-"
            "Solidarité-Numérique_1@300x.png",
        ),
        (
            "https://tmp.kiwix.org/ci/test-website/images/urlencoding1_ico%CC%82ne-"
            "de%CC%81buter-Solidarite%CC%81-Nume%CC%81rique_1@300x.png",
            "tmp.kiwix.org/ci/test-website/images/urlencoding1_icône-débuter-"
            "Solidarité-Numérique_1@300x.png",
        ),
    ],
)
def test_normalize(self, url, zim_path):
    """Check that normalizing `url` yields the ZIM path `zim_path`.

    Both sides are compared through their `.value` attribute.
    """
    normalized = normalize(HttpUrl(url))
    expected = ZimPath(zim_path)
    assert normalized.value == expected.value
def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
zim_output = "zim-out-filename.zim"
main(