warc2zim/tests/test_warc_to_zim.py

514 lines
16 KiB
Python
Raw Normal View History

2020-08-11 03:41:33 +00:00
#!/usr/bin/env python
# vim: ai ts=4 sts=4 et sw=4 nu
2020-12-09 10:55:59 +00:00
import json
import os
Directly store entries using their potentially reduced path. Before, we were storing a entry using its full path and potentially create a redirect entry (using reduced path) pointing to the full path entry. Now, path reduction is part of normalization and so we directly store entries using their (potentially) reduced path. Converted `test/data/video-vimeo.warc.gz` goes from : ``` A/404.html A/index.html A/load.js A/sw.js A/topFrame.html H/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js H/f.vimeocdn.com/p/3.45.3/css/player.css H/f.vimeocdn.com/p/3.45.3/js/player.js H/i.vimeocdn.com/player/354746.png?mw=200&mh=200 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 H/oembed.link/favicon.ico H/oembed.link/https://vimeo.com/347119375 H/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 H/vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4 f.vimeocdn.com/js_opt/modules/utils/vuid.min.js f.vimeocdn.com/p/3.45.3/css/player.css f.vimeocdn.com/p/3.45.3/js/player.js i.vimeocdn.com/player/354746.png?mw=200&mh=200 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 oembed.link/favicon.ico oembed.link/https://vimeo.com/347119375 player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4 vimeo.fuzzy.replayweb.page/video/347119375 
vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4 ``` to : ``` A/404.html A/index.html A/load.js A/sw.js A/topFrame.html H/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js H/f.vimeocdn.com/p/3.45.3/css/player.css H/f.vimeocdn.com/p/3.45.3/js/player.js H/i.vimeocdn.com/player/354746.png?mw=200&mh=200 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 H/oembed.link/favicon.ico H/oembed.link/https://vimeo.com/347119375 H/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 H/vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4 f.vimeocdn.com/js_opt/modules/utils/vuid.min.js f.vimeocdn.com/p/3.45.3/css/player.css f.vimeocdn.com/p/3.45.3/js/player.js i.vimeocdn.com/player/354746.png?mw=200&mh=200 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 oembed.link/favicon.ico oembed.link/https://vimeo.com/347119375 player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4 vimeo.fuzzy.replayweb.page/video/347119375 ``` Notice that `vod-progressive.akamaized.net` is not present. It is "replaced" by `vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4` which is now a plain entry instead of a redirect to `vod-progressive.akamaized.net[...]`.
2023-11-14 16:12:17 +01:00
import re
import time
import pytest
2021-01-12 16:17:46 +00:00
import requests
from zimscraperlib.zim import Archive
from warc2zim.converter import iter_warc_records
2023-11-28 15:27:01 +01:00
from warc2zim.main import main
from warc2zim.url_rewriting import normalize
from warc2zim.utils import get_record_url
TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
# ============================================================================
# Command-line argument lists used to parametrize the `cmdline` fixture.
# Every list starts with a WARC file name, optionally followed by CLI flags.
# NOTE(review): the file names are presumably resolved against TEST_DATA_DIR
# by the tests consuming the fixture -- confirm against those tests.
CMDLINES = [
    ["example-response.warc"],
    ["example-response.warc", "--progress-file", "progress.json"],
    ["example-resource.warc.gz", "--favicon", "https://example.com/some/favicon.ico"],
    ["example-resource.warc.gz", "--favicon", "https://www.google.com/favicon.ico"],
    ["example-revisit.warc.gz"],
    [
        "example-revisit.warc.gz",
        "-u",
        "http://example.iana.org/",
        "--lang",
        "eng",
    ],
    [
        "example-utf8.warc",
        "-u",
        "https://httpbin.org/anything/utf8=%E2%9C%93?query=test&a=b&1=%E2%9C%93",
    ],
    ["single-page-test.warc"],
]
@pytest.fixture(
    params=CMDLINES,
    ids=list(map(" ".join, CMDLINES)),
)
def cmdline(request):
    """Provide one argument list from CMDLINES per parametrized run.

    The test id of each run is the argument list joined with spaces.
    """
    args = request.param
    return args
# ============================================================================
# Cases used to parametrize the `fuzzycheck` fixture. Each case names a WARC
# file ("filename") and a list of paths ("entries").
# NOTE(review): "entries" are presumably the reduced (fuzzy-rule) paths that
# must exist in the ZIM produced from that WARC -- confirm against the test
# consuming the fixture.
FUZZYCHECKS = [
    {
        "filename": "video-yt.warc.gz",
        "entries": [
            "youtube.fuzzy.replayweb.page/get_video_info?video_id=aT-Up5Y4uRI",
            "youtube.fuzzy.replayweb.page/videoplayback?id=o-AE3bg3qVNY-gAWwYgL52vgpHKJe9ijdbu2eciNi5Uo_w",
        ],
    },
    {
        "filename": "video-yt-2.warc.gz",
        "entries": [
            "youtube.fuzzy.replayweb.page/youtubei/v1/player?videoId=aT-Up5Y4uRI",
            "youtube.fuzzy.replayweb.page/videoplayback?id=o-AGDtIqpFRmvgVVZk96wgGyFxL_SFSdpBxs0iBHatQpRD",
        ],
    },
    {
        "filename": "video-vimeo.warc.gz",
        "entries": [
            "vimeo.fuzzy.replayweb.page/video/347119375",
            "vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4",
        ],
    },
]
@pytest.fixture(
    params=FUZZYCHECKS,
    ids=[case["filename"] for case in FUZZYCHECKS],
)
def fuzzycheck(request):
    """Provide one FUZZYCHECKS case per parametrized run.

    The test id of each run is the case's "filename" value.
    """
    case = request.param
    return case
# ============================================================================
class TestWarc2Zim:
def list_articles(self, zimfile):
    """Yield the entries of the ZIM file at `zimfile`, by increasing entry id."""
    archive = Archive(zimfile)
    total = archive.entry_count
    yield from (archive.get_entry_by_id(entry_id) for entry_id in range(total))
def get_metadata(self, zimfile, name):
    """Open the ZIM file at `zimfile` and return its metadata named `name`."""
    return Archive(zimfile).get_metadata(name)
def get_article(self, zimfile, path):
    """Open the ZIM file at `zimfile` and return `Archive.get_content(path)`."""
    return Archive(zimfile).get_content(path)
def get_article_raw(self, zimfile, path):
    """Open the ZIM file at `zimfile` and return the item stored at `path`."""
    return Archive(zimfile).get_item(path)
def verify_warc_and_zim(self, warcfile, zimfile):
    """Assert that the records of `warcfile` are reflected in `zimfile`.

    Every "response", "resource" and "revisit" record with a URL is looked
    up in the ZIM under its scheme-less URL. Records declaring
    ``Content-Length: 0`` must have no item; every other checked record must
    have one. Skipped: duplicate URLs, revisits pointing to their own URL
    and ``www.youtube.com/embed`` URLs.

    Args:
        warcfile: path of the WARC file that was converted.
        zimfile: path of the ZIM file produced from it.

    Raises:
        AssertionError: if a file is missing or a record does not match.
    """
    assert os.path.isfile(warcfile)
    assert os.path.isfile(zimfile)

    # [TOFIX] the real head insert is not known here; with an empty value the
    # containment check on HTML payloads below is trivially true.
    head_insert = b""

    # track to avoid checking duplicates, which are not written to ZIM
    warc_urls = set()

    zim_fh = Archive(zimfile)
    for record in iter_warc_records([warcfile]):
        url = get_record_url(record)
        if not url:
            continue

        if url in warc_urls:
            continue

        if record.rec_type not in ("response", "resource", "revisit"):
            continue

        # ignore revisit records that are to the same url
        if (
            record.rec_type == "revisit"
            and record.rec_headers["WARC-Refers-To-Target-URI"] == url
        ):
            continue

        # Strip the scheme only. Splitting once keeps URLs that embed a second
        # "//" intact (e.g. "https://oembed.link/https://vimeo.com/347119375").
        url_no_scheme = url.split("//", 1)[1]

        if "www.youtube.com/embed" in url_no_scheme:
            # We know that those url are rewritten in zim. Don't check for them,
            # but keep verifying the remaining records.
            continue

        # a purely numeric query string is reduced to a bare "?"
        url_no_scheme = re.sub(r"\?\d+$", "?", url_no_scheme)

        # ensure payloads match
        try:
            payload = zim_fh.get_item(url_no_scheme)
        except KeyError:
            payload = None

        if record.http_headers and record.http_headers.get("Content-Length") == "0":
            assert not payload, f"unexpected ZIM item for empty record {url}"
        elif record.rec_type == "revisit":
            # We must have a payload
            # We should check with the content of the targeted record...
            # But difficult to test as we don't have it
            assert payload, f"missing ZIM item for revisit record {url}"
        else:
            # We must have a payload
            assert payload, f"missing ZIM item for record {url}"
            payload_content = payload.content.tobytes()

            # if HTML, still need to account for the head insert, otherwise should
            # have exact match
            if payload.mimetype.startswith("text/html"):
                assert head_insert in payload_content

        warc_urls.add(url)
Introduce `normalize` and normalization schema. Properly define how we store entries in zim file. We introduce `normalize` helper function class in place of `canonicalize`. We work on normalization on converter level. So we path the path to the items instead of letting them call `normalize`. Converted `test/data/video-vimeo.warc.gz` to zim was containing : ``` A/404.html A/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js A/f.vimeocdn.com/p/3.45.3/css/player.css A/f.vimeocdn.com/p/3.45.3/js/player.js A/i.vimeocdn.com/player/354746.png?mw=200&mh=200 A/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 A/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 A/index.html A/load.js A/oembed.link/favicon.ico A/oembed.link/https://vimeo.com/347119375 A/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 A/sw.js A/topFrame.html A/vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4 H/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js H/f.vimeocdn.com/p/3.45.3/css/player.css H/f.vimeocdn.com/p/3.45.3/js/player.js H/i.vimeocdn.com/player/354746.png?mw=200&mh=200 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 H/oembed.link/favicon.ico H/oembed.link/https://vimeo.com/347119375 H/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 H/vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4 H/vimeo.fuzzy.replayweb.page/video/347119375 
H/vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4 ``` With this change it contains: ``` A/404.html A/index.html A/load.js A/sw.js A/topFrame.html H/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js H/f.vimeocdn.com/p/3.45.3/css/player.css H/f.vimeocdn.com/p/3.45.3/js/player.js H/i.vimeocdn.com/player/354746.png?mw=200&mh=200 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 H/oembed.link/favicon.ico H/oembed.link/https://vimeo.com/347119375 H/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 H/vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4 f.vimeocdn.com/js_opt/modules/utils/vuid.min.js f.vimeocdn.com/p/3.45.3/css/player.css f.vimeocdn.com/p/3.45.3/js/player.js i.vimeocdn.com/player/354746.png?mw=200&mh=200 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 oembed.link/favicon.ico oembed.link/https://vimeo.com/347119375 player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4 vimeo.fuzzy.replayweb.page/video/347119375 
vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4 ```
2023-11-14 16:00:08 +01:00
def test_normalize(self):
    """Check `normalize` against a table of raw inputs and expected paths.

    Every expected value below is the input with its leading ``http://`` or
    ``https://`` removed; the last case additionally loses the digits that
    follow the ``?``.
    """
    # None is the only case compared by identity rather than equality.
    assert normalize(None) is None

    expectations = [
        ("", ""),
        ("https://exemple.com", "exemple.com"),
        ("https://exemple.com/", "exemple.com/"),
        ("http://example.com/?foo=bar", "example.com/?foo=bar"),
        # bytes input is expected to come back as a str
        (b"http://example.com/?foo=bar", "example.com/?foo=bar"),
        ("https://example.com/?foo=bar", "example.com/?foo=bar"),
        # a URL embedded further down the path keeps its own scheme
        (
            "https://example.com/some/path/http://example.com/?foo=bar",
            "example.com/some/path/http://example.com/?foo=bar",
        ),
        # an input without a leading scheme is expected back unchanged
        (
            "example.com/some/path/http://example.com/?foo=bar",
            "example.com/some/path/http://example.com/?foo=bar",
        ),
        (
            "http://example.com/path/with/final/slash/",
            "example.com/path/with/final/slash/",
        ),
        ("http://test@example.com/", "test@example.com/"),
        # the numeric query is expected to be dropped while the "?" remains
        (
            "http://lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?1588230493",
            "lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?",
        ),
    ]
    for raw, expected in expectations:
        assert normalize(raw) == expected
def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
    """Convert ``example-response.warc`` with explicit options and inspect the ZIM.

    Checks the exact set of entry paths/titles, the list of metadata keys,
    the presence of both indexes, and the Description, Tags and Title values.
    """
    zim_name = "zim-out-filename.zim"
    warc_path = os.path.join(TEST_DATA_DIR, "example-response.warc")

    # --tags is passed three times, with --desc in between the second and third
    cli_args = ["-v", warc_path]
    cli_args += ["--name", "example-response"]
    cli_args += ["--output", str(tmp_path)]
    cli_args += ["--zim-file", zim_name]
    cli_args += ["--tags", "some"]
    cli_args += ["--tags", "foo"]
    cli_args += ["--desc", "test zim"]
    cli_args += ["--tags", "bar"]
    cli_args += ["--title", "Some Title"]
    main(cli_args)

    zim_path = tmp_path / zim_name
    assert os.path.isfile(zim_path)

    titles_by_path = {}
    for entry in self.list_articles(zim_path):
        titles_by_path[entry.path] = entry.title

    expected_entries = {
        # entries from WARC
        "example.com/": "Example Domain",
        "_zim_static/__wb_module_decl.js": "_zim_static/__wb_module_decl.js",
        "_zim_static/wombat.js": "_zim_static/wombat.js",
        "_zim_static/wombat_setup.js": "_zim_static/wombat_setup.js",
    }
    assert titles_by_path == expected_entries

    archive = Archive(zim_path)

    # ZIM metadata
    expected_metadata_keys = [
        "Counter",
        "Creator",
        "Date",
        "Description",
        "Language",
        "Name",
        "Publisher",
        "Scraper",
        "Tags",
        "Title",
    ]
    assert list(archive.metadata.keys()) == expected_metadata_keys

    assert archive.has_fulltext_index
    assert archive.has_title_index

    description = self.get_metadata(zim_path, "Description")
    assert description == b"test zim"

    # the three user tags are expected after the two leading "_" tags
    tags = self.get_metadata(zim_path, "Tags")
    assert tags == b"_ftindex:yes;_category:other;some;foo;bar"

    title = self.get_metadata(zim_path, "Title")
    assert title == b"Some Title"
def test_warc_to_zim(self, cmdline, tmp_path):
    """Run the converter for one command line and verify the resulting ZIM.

    ``cmdline`` is modified in place: its first item (the input filename) is
    replaced by the path under ``TEST_DATA_DIR`` and ``--output``/``--name``
    are appended before it is handed to ``main``.
    """
    # input filename is the first argument
    filename = cmdline[0]

    # point the first argument at the file inside the test data directory
    warcfile = os.path.join(TEST_DATA_DIR, filename)
    cmdline[0] = warcfile
    cmdline += ["--output", str(tmp_path), "--name", filename]

    main(cmdline)

    # expected output name: "<name>_<current year>-<current month>.zim"
    month_stamp = time.strftime("%Y-%m")
    zimfile = f"{filename}_{month_stamp}.zim"

    if "--progress-file" in cmdline:
        with open(tmp_path / "progress.json") as fh:
            progress = json.load(fh)
        # both counters must be positive and written must not exceed total
        assert progress["written"] > 0
        assert progress["total"] > 0
        assert progress["written"] <= progress["total"]

    self.verify_warc_and_zim(warcfile, tmp_path / zimfile)
def test_same_domain_only(self, tmp_path):
    """Convert with ``--include-domains example.com/`` and check entry paths.

    Every entry outside ``_zim_static/`` must have a path starting with
    ``example.com/``.
    """
    zim_name = "same-domain.zim"
    warc_path = os.path.join(TEST_DATA_DIR, "example-revisit.warc.gz")

    cli_args = [warc_path]
    cli_args += ["--favicon", "http://example.com/favicon.ico"]
    cli_args += ["--include-domains", "example.com/"]
    cli_args += ["--lang", "eng"]
    cli_args += ["--zim-file", zim_name]
    cli_args += ["--name", "same-domain"]
    cli_args += ["--output", str(tmp_path)]
    main(cli_args)

    zim_path = tmp_path / zim_name

    for entry in self.list_articles(zim_path):
        path = entry.path
        # entries under _zim_static/ are exempt from the domain check
        if path.startswith("_zim_static/"):
            continue
        assert path.startswith("example.com/")
def test_skip_self_redirect(self, tmp_path):
    """Convert ``self-redirect.warc`` and check that a ZIM file is produced.

    The conversion must complete without raising, and the file named by
    ``--zim-file`` must exist inside the ``--output`` directory afterwards.
    """
    zim_output = "self-redir.zim"

    main(
        [
            os.path.join(TEST_DATA_DIR, "self-redirect.warc"),
            "--output",
            str(tmp_path),
            "--zim-file",
            zim_output,
            "--name",
            "self-redir",
        ]
    )

    zim_output = tmp_path / zim_output

    # Previously the computed path was never used, so the test asserted
    # nothing beyond "main() did not raise".
    assert os.path.isfile(zim_output)
def test_include_domains_favicon_and_language(self, tmp_path):
    """Convert ``single-page-test.warc`` restricted to ``reseau-canope.fr``.

    Checks the entry paths, the Language metadata, the favicon entry, the
    48x48 illustration metadata and the default Tags value.
    """
    zim_name = "spt.zim"
    warc_path = os.path.join(TEST_DATA_DIR, "single-page-test.warc")

    cli_args = [warc_path]
    cli_args += ["-i", "reseau-canope.fr"]
    cli_args += ["--output", str(tmp_path)]
    cli_args += ["--zim-file", zim_name]
    cli_args += ["--name", "spt"]
    main(cli_args)

    zim_path = tmp_path / zim_name

    for entry in self.list_articles(zim_path):
        path = entry.path
        # entries under _zim_static/ are exempt from the domain check
        if path.startswith("_zim_static/"):
            continue
        assert "reseau-canope.fr/" in path

    # test detected language
    language = self.get_metadata(zim_path, "Language")
    assert language == b"fra"

    # test detected favicon
    favicon_path = (
        "lesfondamentaux.reseau-canope.fr/fileadmin/template/img/favicon.ico"
    )
    assert self.get_article(zim_path, favicon_path)
    assert self.get_metadata(zim_path, "Illustration_48x48@1")

    # test default tags added
    tags = self.get_metadata(zim_path, "Tags")
    assert tags == b"_ftindex:yes;_category:other"
def test_all_warcs_root_dir(self, tmp_path):
    """Convert the whole test data directory into a single ZIM.

    The directory itself is passed as input; the test then checks that
    entries originating from several different WARC files are present
    with non-empty content.
    """
    zim_name = "test-all.zim"
    cli_args = [
        os.path.join(TEST_DATA_DIR),
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_name,
        "--name",
        "test-all",
        "--url",
        "http://example.com",
    ]
    main(cli_args)

    zim_path = tmp_path / zim_name

    # articles coming from different warc records in the tests/data dir
    expected_paths = (
        # from example.warc.gz
        "example.com/",
        # from single-page-test.warc
        "lesfondamentaux.reseau-canope.fr/accueil.html",
        # timestamp fuzzy match from example-with-timestamp.warc
        "example.com/path.txt?",
    )
    for entry_path in expected_paths:
        assert self.get_article(zim_path, entry_path) != b""
def test_fuzzy_urls(self, tmp_path, fuzzycheck):
    """Convert the WARC named by ``fuzzycheck`` and look up its entries.

    ``fuzzycheck`` supplies a ``"filename"`` (WARC file in the test data
    directory) and ``"entries"`` (paths to look up in the resulting ZIM).
    """
    warc_name = fuzzycheck["filename"]
    zim_name = warc_name + ".zim"
    cli_args = [
        os.path.join(TEST_DATA_DIR, warc_name),
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_name,
        "--name",
        "test-fuzzy",
    ]
    main(cli_args)

    zim_path = tmp_path / zim_name

    for expected_entry in fuzzycheck["entries"]:
        # Each expected entry should be a plain item: per the original note,
        # get_article_raw is equivalent to getItem and fails when the entry
        # is not an item.
        self.get_article_raw(zim_path, expected_entry)
def test_error_bad_main_page(self, tmp_path):
    """Check that an unknown main page URL aborts the conversion.

    ``main`` is invoked with ``-u https://no-such-url.example.com`` and is
    expected to raise ``KeyError`` with a message naming that main page.
    """
    zim_output_not_created = "zim-out-not-created.zim"

    # ``match`` is applied as a regular expression search, so the dots of the
    # host name are escaped to be matched literally rather than as wildcards.
    expected_message = (
        r"Unable to find WARC record for main page: "
        r"no-such-url\.example\.com/, aborting"
    )

    with pytest.raises(KeyError, match=expected_message):
        main(
            [
                "-v",
                os.path.join(TEST_DATA_DIR, "example-response.warc"),
                "-u",
                "https://no-such-url.example.com",
                "--output",
                str(tmp_path),
                "--name",
                "bad",
                "--zim-file",
                zim_output_not_created,
            ]
        )
def test_args_only(self):
    """Exercise ``main`` with argument lists that name no input files.

    Three cases are covered: no arguments at all (``SystemExit`` with code
    2), a missing output directory (``FileNotFoundError``) and a valid
    invocation without inputs (return value 100).
    """
    # error, name required
    with pytest.raises(SystemExit) as exc_info:
        main([])
    # Checked after the ``with`` block: a statement placed after the raising
    # call inside the block would never be executed.
    assert exc_info.value.code == 2

    # error, no such output directory
    with pytest.raises(
        FileNotFoundError, match="No such file or directory.*/no-such-dir"
    ):
        main(["--name", "test", "--output", "/no-such-dir"])

    # success, special error code for no output files
    assert main(["--name", "test", "--output", "./"]) == 100
def test_custom_css(self, tmp_path):
    """Convert with ``--custom-css`` pointing at a local stylesheet.

    The page ``example.com/`` must reference
    ``warc2zim.kiwix.app/custom.css`` and that entry must hold exactly the
    bytes of the local file.
    """
    css_bytes = b"* { background-color: red; }"
    css_file = tmp_path / "custom.css"
    css_file.write_bytes(css_bytes)

    zim_name = "test-css.zim"
    cli_args = [
        os.path.join(TEST_DATA_DIR, "example-response.warc"),
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_name,
        "--name",
        "test-css",
        "--custom-css",
        str(css_file),
    ]
    main(cli_args)

    zim_path = tmp_path / zim_name

    # the main page references the custom stylesheet
    page = self.get_article(zim_path, "example.com/")
    assert b"warc2zim.kiwix.app/custom.css" in page

    # the stylesheet entry carries the exact content that was supplied
    stored_css = self.get_article(zim_path, "warc2zim.kiwix.app/custom.css")
    assert css_bytes == stored_css
def test_custom_css_remote(self, tmp_path):
    """Convert with ``--custom-css`` pointing at a remote URL.

    The page ``example.com/`` must reference
    ``warc2zim.kiwix.app/custom.css`` and that entry must equal the content
    fetched from the same URL with ``requests``.
    """
    zim_name = "test-css.zim"
    css_url = (
        "https://cdn.jsdelivr.net/npm/bootstrap@4.5.3/dist/css/bootstrap-reboot.css"
    )
    cli_args = [
        os.path.join(TEST_DATA_DIR, "example-response.warc"),
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_name,
        "--name",
        "test-css",
        "--custom-css",
        css_url,
    ]
    main(cli_args)

    zim_path = tmp_path / zim_name

    # the main page references the custom stylesheet
    page = self.get_article(zim_path, "example.com/")
    assert b"warc2zim.kiwix.app/custom.css" in page

    # the stylesheet entry matches what the remote URL serves
    stored_css = self.get_article(zim_path, "warc2zim.kiwix.app/custom.css")
    remote_css = requests.get(css_url, timeout=10).content
    assert stored_css == remote_css