mirror of
https://github.com/openzim/zimit.git
synced 2025-12-31 04:23:15 +00:00
Compare commits
447 commits
proof-of-c
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a7e236f0d7 | ||
|
|
81018f06fa | ||
|
|
34ce7eb98d | ||
|
|
5bb068ffea | ||
|
|
aec19d95d2 | ||
|
|
277473884e | ||
|
|
e30a82a91c | ||
|
|
ef004f3863 | ||
|
|
6db73a0a83 | ||
|
|
57a88434e2 | ||
|
|
4595d2a302 | ||
|
|
611d2033f7 | ||
|
|
00845293d6 | ||
|
|
44cf4218cb | ||
|
|
6b520318a2 | ||
|
|
a9805c84c2 | ||
|
|
8630b87a1f | ||
|
|
ad09665c4a | ||
|
|
1d2069a66b | ||
|
|
4ec47cd6dd | ||
|
|
b60dd388e7 | ||
|
|
5624cbf081 | ||
|
|
8c471d9ee2 | ||
|
|
009b8b4bd6 | ||
|
|
0c795b0051 | ||
|
|
b5d87198d8 | ||
|
|
511c3a5021 | ||
|
|
3421ca0212 | ||
|
|
12fde3af98 | ||
|
|
6c006d9a4d | ||
|
|
146af5de0a | ||
|
|
4e0174836d | ||
|
|
1e6748ab69 | ||
|
|
5d319cdc09 | ||
|
|
88b85311e0 | ||
|
|
9fc66a95b7 | ||
|
|
6ee053af5f | ||
|
|
ac4ba0b01e | ||
|
|
1287351c1d | ||
|
|
b85a6b7e4e | ||
|
|
eebc75f868 | ||
|
|
00f0e475ae | ||
|
|
363ff40767 | ||
|
|
dd65902556 | ||
|
|
3e2ad5fede | ||
|
|
5e53be6fa4 | ||
|
|
1b5b9bb80b | ||
|
|
bce22ceac1 | ||
|
|
e3cd12b0d1 | ||
|
|
ee0f4c6cec | ||
|
|
91d5edda4a | ||
|
|
3eb6c09046 | ||
|
|
a9efec4797 | ||
|
|
2f7a83e187 | ||
|
|
96c4c3bdfd | ||
|
|
7bfb4b25f0 | ||
|
|
ed1a8a0aa9 | ||
|
|
dc6b5aafb7 | ||
|
|
4f9085b10e | ||
|
|
b4ec60f316 | ||
|
|
ee82837aaa | ||
|
|
bc73193ce0 | ||
|
|
101fb71a0b | ||
|
|
3a7f583a96 | ||
|
|
8b4b18bfb7 | ||
|
|
d228e9f346 | ||
|
|
2e48ea1af6 | ||
|
|
a7e1026b2e | ||
|
|
cc84848c32 | ||
|
|
6ec53f774f | ||
|
|
5af981c01c | ||
|
|
b4c0495f48 | ||
|
|
cea10bd3b5 | ||
|
|
4ef9a0d380 | ||
|
|
bf0dcd2ffc | ||
|
|
9396cf1ca0 | ||
|
|
0f136d2f2f | ||
|
|
0cb84f2126 | ||
|
|
4835adbdd7 | ||
|
|
14670d4c69 | ||
|
|
8cddcf0666 | ||
|
|
97ea6dfd7b | ||
|
|
8d42a8dd93 | ||
|
|
00d2433383 | ||
|
|
b5dac3c309 | ||
|
|
16a4f8d4d8 | ||
|
|
e9adc38856 | ||
|
|
bfa226bf81 | ||
|
|
15b72022ce | ||
|
|
ac3975550d | ||
|
|
800142d66f | ||
|
|
5869a6a1c9 | ||
|
|
1307285641 | ||
|
|
c81c148207 | ||
|
|
4c1007cb94 | ||
|
|
06693c9d86 | ||
|
|
1de457a478 | ||
|
|
68c2379b39 | ||
|
|
a95dabba69 | ||
|
|
9f2101e541 | ||
|
|
2a3b4f62fd | ||
|
|
02d62b30ee | ||
|
|
ef8023a1a9 | ||
|
|
ea320a48c6 | ||
|
|
d50bcb0b0d | ||
|
|
e8faebbb24 | ||
|
|
0e6919300b | ||
|
|
9bf3ea4ea3 | ||
|
|
fa6c128379 | ||
|
|
f17e15287e | ||
|
|
b056c6dc4f | ||
|
|
5fe6539201 | ||
|
|
01187be0f3 | ||
|
|
eeac5014f1 | ||
|
|
1c30abd39d | ||
|
|
3e2ddd1708 | ||
|
|
6d5fc0bed0 | ||
|
|
ca86c8c7cd | ||
|
|
1c9d927438 | ||
|
|
5e3c731fb2 | ||
|
|
113eeebf9c | ||
|
|
6a804e9a8e | ||
|
|
501520d07f | ||
|
|
e0d1adf676 | ||
|
|
7873667434 | ||
|
|
a1329974a1 | ||
|
|
6b3c725eeb | ||
|
|
7f76415710 | ||
|
|
37c4beda6a | ||
|
|
ef12d01958 | ||
|
|
d814c23178 | ||
|
|
efdf7804c0 | ||
|
|
d0d0c6e6e6 | ||
|
|
be1e2d6745 | ||
|
|
f7df467eab | ||
|
|
af48be8f82 | ||
|
|
7e69d8ab75 | ||
|
|
2e082c41a9 | ||
|
|
ad5adcd096 | ||
|
|
bc06e85ced | ||
|
|
a0f802099a | ||
|
|
eb32adfea7 | ||
|
|
0d5a08c912 | ||
|
|
8cd1db6eef | ||
|
|
459a30a226 | ||
|
|
861751a7ed | ||
|
|
1ea533c75f | ||
|
|
6d078c4dcf | ||
|
|
751e10473a | ||
|
|
f756c2c652 | ||
|
|
097613de29 | ||
|
|
4c35836395 | ||
|
|
6e3951dfa7 | ||
|
|
ea7653ef37 | ||
|
|
80b6b26782 | ||
|
|
6ab3401fa2 | ||
|
|
a1efe8dccf | ||
|
|
526019e095 | ||
|
|
2452e60d9d | ||
|
|
dee57a8dd8 | ||
|
|
c92782bea0 | ||
|
|
7305f70300 | ||
|
|
021654e6b3 | ||
|
|
7357b1f2ce | ||
|
|
8a64216ac0 | ||
|
|
9d43636559 | ||
|
|
52e225619e | ||
|
|
3dc7327fb0 | ||
|
|
dcd6427b8a | ||
|
|
fbd01a77ce | ||
|
|
24fdbe19d9 | ||
|
|
636a6a6d28 | ||
|
|
91a53f70ec | ||
|
|
e8995a9f59 | ||
|
|
4265effe91 | ||
|
|
2be5650a8c | ||
|
|
de0720e301 | ||
|
|
b73a3e04d0 | ||
|
|
2f50db874d | ||
|
|
68f2ed14d6 | ||
|
|
baa0d9ecc7 | ||
|
|
2835c7b078 | ||
|
|
e6a6560b85 | ||
|
|
77747ec1d3 | ||
|
|
c67ccb9528 | ||
|
|
83690f410d | ||
|
|
d8e6d55f87 | ||
|
|
ae6e5ffaf6 | ||
|
|
59057bdbb1 | ||
|
|
7806aeba63 | ||
|
|
936666917c | ||
|
|
957e52c57f | ||
|
|
36f2fa5f2b | ||
|
|
8fdad5954e | ||
|
|
9e6c998816 | ||
|
|
4cf6e01669 | ||
|
|
ce49a5d4e9 | ||
|
|
1d54b20873 | ||
|
|
9a7415a402 | ||
|
|
d54aa22bb2 | ||
|
|
f637c3fccc | ||
|
|
728784d6bf | ||
|
|
e24479945f | ||
|
|
3070fe9724 | ||
|
|
54732692ac | ||
|
|
867d14fd00 | ||
|
|
5c716747b4 | ||
|
|
456219deb3 | ||
|
|
a9769b2871 | ||
|
|
a4cb27a793 | ||
|
|
4d31f8eabb | ||
|
|
b69f3d610f | ||
|
|
c2dc8c5ccc | ||
|
|
857ae5674d | ||
|
|
89aea6b41e | ||
|
|
a44c1a7c7f | ||
|
|
6ca9be48c7 | ||
|
|
01c5833c29 | ||
|
|
7caa355c31 | ||
|
|
49da57c5b6 | ||
|
|
9244f2e69c | ||
|
|
ef462b5024 | ||
|
|
f4359022b2 | ||
|
|
a505df9fe0 | ||
|
|
343d0040cf | ||
|
|
c7fdc1d11e | ||
|
|
c0ffb74d8c | ||
|
|
343fb7e770 | ||
|
|
909b6e3da8 | ||
|
|
f46f2568ff | ||
|
|
19b4898326 | ||
|
|
10471c1ea9 | ||
|
|
eebf26f7cb | ||
|
|
27f9dcc53f | ||
|
|
22551388e0 | ||
|
|
a352c0c402 | ||
|
|
e034b08852 | ||
|
|
1c58bbe303 | ||
|
|
eab3d1f189 | ||
|
|
bbc8a48bc9 | ||
|
|
7bc0ed9c02 | ||
|
|
af0c93f1df | ||
|
|
cd6a55b179 | ||
|
|
f80dbd11d9 | ||
|
|
a62f31ed0d | ||
|
|
d6c0c6ce63 | ||
|
|
a2b4c71ec9 | ||
|
|
b98e8f7027 | ||
|
|
79d5f8bc7b | ||
|
|
216ac09d8c | ||
|
|
51ef841836 | ||
|
|
6e6c0e8b39 | ||
|
|
7ca08791e7 | ||
|
|
5512e814c7 | ||
|
|
60b970f844 | ||
|
|
99ca5ca901 | ||
|
|
51d0409128 | ||
|
|
4ad41a7d54 | ||
|
|
d24775d70c | ||
|
|
a73114d140 | ||
|
|
c98e4505a8 | ||
|
|
9e9140690d | ||
|
|
31a652c8dd | ||
|
|
36ba61b0a5 | ||
|
|
24a250f0ee | ||
|
|
56fb86e531 | ||
|
|
e0a4d3ffef | ||
|
|
b89f57b843 | ||
|
|
18ed6ca540 | ||
|
|
d487d658a4 | ||
|
|
2a317c91e4 | ||
|
|
f22bb9218c | ||
|
|
d8f6cef7f3 | ||
|
|
00051453e1 | ||
|
|
3769c77cd4 | ||
|
|
df2403c6dd | ||
|
|
2be5562553 | ||
|
|
ea210bcd10 | ||
|
|
da055a828d | ||
|
|
7e24388820 | ||
|
|
12dab25e61 | ||
|
|
951241d8bf | ||
|
|
df0fa9bbaf | ||
|
|
c9c7e7a26f | ||
|
|
1224476b41 | ||
|
|
e590e851be | ||
|
|
906161ea51 | ||
|
|
cbaaa77a1f | ||
|
|
7cb118eaeb | ||
|
|
722306d3bf | ||
|
|
61dc792653 | ||
|
|
941db5fdfc | ||
|
|
47ede96f91 | ||
|
|
95c27bad08 | ||
|
|
57e2f41439 | ||
|
|
b568848a98 | ||
|
|
af8196095d | ||
|
|
70a80681a6 | ||
|
|
8d287466bd | ||
|
|
fc9ad3759e | ||
|
|
c31e80608e | ||
|
|
8b4ea950a8 | ||
|
|
8ecd0a3210 | ||
|
|
4f676e37c7 | ||
|
|
b7265b49b6 | ||
|
|
b8714d1260 | ||
|
|
6324b7c7c5 | ||
|
|
238d1a6016 | ||
|
|
79d444e7ea | ||
|
|
64bc8bf09f | ||
|
|
459778d472 | ||
|
|
af9a3d24d9 | ||
|
|
4b7e504d99 | ||
|
|
554fff5c87 | ||
|
|
8fd9462e25 | ||
|
|
0172c53c50 | ||
|
|
3756c6612f | ||
|
|
511fccdc56 | ||
|
|
859e79c165 | ||
|
|
cf26f8c33a | ||
|
|
0624c50121 | ||
|
|
fab4ff6bf5 | ||
|
|
a9cf1cd9c3 | ||
|
|
2d4375fd0a | ||
|
|
472c4cf41a | ||
|
|
ce68493087 | ||
|
|
857e044c84 | ||
|
|
8c6d2bfb45 | ||
|
|
b79ad1b138 | ||
|
|
142970bc0a | ||
|
|
b29aeb08e6 | ||
|
|
0eeb2ad9e3 | ||
|
|
dffc81860e | ||
|
|
e32aac3ec0 | ||
|
|
b2bb77cd65 | ||
|
|
932f97c999 | ||
|
|
1f490ace8f | ||
|
|
8b5eeb31c7 | ||
|
|
acf0aaf552 | ||
|
|
823e6bbb01 | ||
|
|
e29b6f3ad6 | ||
|
|
885e1763a1 | ||
|
|
80f3d3293f | ||
|
|
0025901959 | ||
|
|
99f8fbafe1 | ||
|
|
3d3f4fb121 | ||
|
|
8bcd692462 | ||
|
|
1f31d6c1a5 | ||
|
|
98587045b4 | ||
|
|
efd8ca53b4 | ||
|
|
14ced5c481 | ||
|
|
2e9c129523 | ||
|
|
03abf6050a | ||
|
|
f746f7b020 | ||
|
|
14fc8ffe0f | ||
|
|
ae820472de | ||
|
|
cfa4b0e7f8 | ||
|
|
964746481f | ||
|
|
69892a215f | ||
|
|
6da4714cff | ||
|
|
d0d51539fe | ||
|
|
c3a7a02121 | ||
|
|
76c92bdb4c | ||
|
|
610ecc7e5c | ||
|
|
a60f7a392f | ||
|
|
871f7ab58d | ||
|
|
e91cd7921e | ||
|
|
f4c11dc948 | ||
|
|
01302d3885 | ||
|
|
f72caad35c | ||
|
|
71603f8a15 | ||
|
|
ff5c6b3dc9 | ||
|
|
0cb3db6f16 | ||
|
|
508286ef78 | ||
|
|
56d319ce3f | ||
|
|
f6d44314cd | ||
|
|
eb5ca99bfb | ||
|
|
85fad62b61 | ||
|
|
3ffa34d46e | ||
|
|
b9ed1d00a2 | ||
|
|
5084c54af6 | ||
|
|
9422defe86 | ||
|
|
c0bb0503b8 | ||
|
|
a801a1eef6 | ||
|
|
4723376ebc | ||
|
|
5e4b3d80b3 | ||
|
|
82f0fae959 | ||
|
|
a930542af8 | ||
|
|
0e3af5124b | ||
|
|
0082d313ae | ||
|
|
568068ecfc | ||
|
|
989567e05e | ||
|
|
5b640f2f8b | ||
|
|
88a280bc58 | ||
|
|
6b5dbd20cb | ||
|
|
c228c8300c | ||
|
|
f6282dbf14 | ||
|
|
ae9aba7a00 | ||
|
|
8ceabce0e9 | ||
|
|
a425cd6956 | ||
|
|
91fe76c56e | ||
|
|
c6f27f3bf6 | ||
|
|
ab4e2e1a14 | ||
|
|
d8e313c492 | ||
|
|
904c95963c | ||
|
|
fb2232d8b1 | ||
|
|
2e2db2f352 | ||
|
|
5b3101f2d8 | ||
|
|
d9ba7e246f | ||
|
|
65b3b533b7 | ||
|
|
2c1b401e93 | ||
|
|
c26fe5d4cd | ||
|
|
9f62f52a02 | ||
|
|
9046f21f53 | ||
|
|
1198d1b6b9 | ||
|
|
0a494ee168 | ||
|
|
bd8d90efa5 | ||
|
|
e608cbd71e | ||
|
|
9e9ec82ad1 | ||
|
|
2b2f96d983 | ||
|
|
736deccbb5 | ||
|
|
901729a069 | ||
|
|
8842bf7048 | ||
|
|
1eee9ab633 | ||
|
|
880165ce91 | ||
|
|
94f0b7362d | ||
|
|
3519d32ba6 | ||
|
|
24c843c4af | ||
|
|
daa2492655 | ||
|
|
e4128c8183 | ||
|
|
bb5b7e48c1 | ||
|
|
252516e38c | ||
|
|
ac650bff05 | ||
|
|
01f2471ab8 | ||
|
|
71e94914aa | ||
|
|
6a925748d5 | ||
|
|
f25b390f15 | ||
|
|
f252245983 | ||
|
|
b00c4262a7 | ||
|
|
ff2773677c | ||
|
|
9b23de828b | ||
|
|
4e04645e6b | ||
|
|
1de577bd78 | ||
|
|
7346527a81 | ||
|
|
bdfd9be399 | ||
|
|
15cf636ff3 | ||
|
|
d178431e20 |
44 changed files with 4983 additions and 8310 deletions
4
.github/FUNDING.yml
vendored
4
.github/FUNDING.yml
vendored
|
|
@ -1,6 +1,6 @@
|
|||
# These are supported funding model platforms
|
||||
|
||||
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
|
||||
github: kiwix # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
|
||||
patreon: # Replace with a single Patreon username
|
||||
open_collective: # Replace with a single Open Collective username
|
||||
ko_fi: # Replace with a single Ko-fi username
|
||||
|
|
@ -9,4 +9,4 @@ community_bridge: # Replace with a single Community Bridge project-name e.g., cl
|
|||
liberapay: # Replace with a single Liberapay username
|
||||
issuehunt: # Replace with a single IssueHunt username
|
||||
otechie: # Replace with a single Otechie username
|
||||
custom: https://kiwix.org/support-us/
|
||||
custom: # https://kiwix.org/support-us/
|
||||
|
|
|
|||
15
.github/stale.yml
vendored
Normal file
15
.github/stale.yml
vendored
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
daysUntilClose: false
|
||||
staleLabel: stale
|
||||
|
||||
issues:
|
||||
daysUntilStale: 60
|
||||
markComment: >
|
||||
This issue has been automatically marked as stale because it has not had
|
||||
recent activity. It will now be reviewed manually. Thank you
|
||||
for your contributions.
|
||||
pulls:
|
||||
daysUntilStale: 7
|
||||
markComment: >
|
||||
This pull request has been automatically marked as stale because it has not had
|
||||
recent activity. It will now be reviewed manually. Thank you
|
||||
for your contributions.
|
||||
34
.github/workflows/DailyTests.yaml
vendored
Normal file
34
.github/workflows/DailyTests.yaml
vendored
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
name: DailyTests
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 4 * * *"
|
||||
workflow_dispatch:
|
||||
|
||||
|
||||
jobs:
|
||||
run-daily-tests:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: build zimit image
|
||||
run: docker build -t local-zimit .
|
||||
|
||||
- name: run crawl of test website
|
||||
run: docker run -v $PWD/output:/output local-zimit zimit --seeds https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim
|
||||
|
||||
- name: archive ZIM
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: tests_eng_test-website.zim
|
||||
path: output/tests_eng_test-website.zim
|
||||
retention-days: 30
|
||||
|
||||
- name: build tests-daily Docker image
|
||||
run: docker build -t local-tests-daily tests-daily
|
||||
|
||||
- name: run integration test suite
|
||||
run: docker run -e SKIP_YOUTUBE_TEST="True" -v $PWD/tests-daily/daily.py:/app/daily.py -v $PWD/output:/output local-tests-daily bash -c "cd /app && pytest -v --log-level=INFO --log-format='%(levelname)s - %(message)s' daily.py"
|
||||
53
.github/workflows/Publish.yml
vendored
Normal file
53
.github/workflows/Publish.yml
vendored
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
name: Publish released version
|
||||
|
||||
on:
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
jobs:
|
||||
publish-amd64:
|
||||
runs-on: ubuntu-24.04
|
||||
name: "Publish for AMD64"
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Build and push Docker image
|
||||
uses: openzim/docker-publish-action@v10
|
||||
with:
|
||||
image-name: openzim/zimit
|
||||
tag-pattern: /^v([0-9.]+)$/
|
||||
latest-on-tag: true
|
||||
restrict-to: openzim/zimit
|
||||
registries: ghcr.io
|
||||
credentials: |
|
||||
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
|
||||
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
|
||||
repo_description: auto
|
||||
repo_overview: auto
|
||||
platforms: |
|
||||
linux/amd64
|
||||
|
||||
# Disabled for now, see https://github.com/openzim/zimit/issues/463
|
||||
# publish-arm64:
|
||||
# runs-on: ubuntu-24.04
|
||||
# name: "Publish for ARM64"
|
||||
#
|
||||
# steps:
|
||||
# - uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Build and push Docker image
|
||||
# uses: openzim/docker-publish-action@v10
|
||||
# with:
|
||||
# image-name: openzim/zimit
|
||||
# tag-pattern: /^v([0-9.]+)$/
|
||||
# latest-on-tag: true
|
||||
# restrict-to: openzim/zimit
|
||||
# registries: ghcr.io
|
||||
# credentials: |
|
||||
# GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
|
||||
# GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
|
||||
# repo_description: auto
|
||||
# repo_overview: auto
|
||||
# platforms: |
|
||||
# linux/arm64
|
||||
55
.github/workflows/PublishDockerDevImage.yaml
vendored
Normal file
55
.github/workflows/PublishDockerDevImage.yaml
vendored
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
name: Publish Docker dev image
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
publish-amd64:
|
||||
runs-on: ubuntu-24.04
|
||||
name: "Publish for AMD64"
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Build and push Docker image
|
||||
uses: openzim/docker-publish-action@v10
|
||||
with:
|
||||
image-name: openzim/zimit
|
||||
manual-tag: dev
|
||||
latest-on-tag: false
|
||||
restrict-to: openzim/zimit
|
||||
registries: ghcr.io
|
||||
credentials: |
|
||||
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
|
||||
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
|
||||
repo_description: auto
|
||||
repo_overview: auto
|
||||
platforms: |
|
||||
linux/amd64
|
||||
|
||||
# Disabled for now, see https://github.com/openzim/zimit/issues/463
|
||||
# publish-arm64:
|
||||
# runs-on: ubuntu-24.04-arm
|
||||
# name: "Publish for ARM64"
|
||||
#
|
||||
# steps:
|
||||
# - uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Build and push Docker image
|
||||
# uses: openzim/docker-publish-action@v10
|
||||
# with:
|
||||
# image-name: openzim/zimit
|
||||
# manual-tag: dev
|
||||
# latest-on-tag: false
|
||||
# restrict-to: openzim/zimit
|
||||
# registries: ghcr.io
|
||||
# credentials: |
|
||||
# GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
|
||||
# GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
|
||||
# repo_description: auto
|
||||
# repo_overview: auto
|
||||
# platforms: |
|
||||
# linux/arm64
|
||||
34
.github/workflows/QA.yaml
vendored
Normal file
34
.github/workflows/QA.yaml
vendored
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
name: QA
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
jobs:
|
||||
check-qa:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version-file: pyproject.toml
|
||||
architecture: x64
|
||||
|
||||
- name: Install dependencies (and project)
|
||||
run: |
|
||||
pip install -U pip
|
||||
pip install -e .[lint,scripts,test,check]
|
||||
|
||||
- name: Check black formatting
|
||||
run: inv lint-black
|
||||
|
||||
- name: Check ruff
|
||||
run: inv lint-ruff
|
||||
|
||||
- name: Check pyright
|
||||
run: inv check-pyright
|
||||
81
.github/workflows/Tests.yaml
vendored
Normal file
81
.github/workflows/Tests.yaml
vendored
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
name: Tests
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
jobs:
|
||||
run-tests:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version-file: pyproject.toml
|
||||
architecture: x64
|
||||
|
||||
- name: Install dependencies (and project)
|
||||
run: |
|
||||
pip install -U pip
|
||||
pip install -e .[test,scripts]
|
||||
|
||||
- name: Run the tests
|
||||
run: inv coverage --args "-vvv"
|
||||
|
||||
- name: Upload coverage report to codecov
|
||||
uses: codecov/codecov-action@v4
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
|
||||
build_python:
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version-file: pyproject.toml
|
||||
architecture: x64
|
||||
|
||||
- name: Ensure we can build Python targets
|
||||
run: |
|
||||
pip install -U pip build
|
||||
python3 -m build --sdist --wheel
|
||||
|
||||
# this job replaces the standard "build_docker" job since it builds the docker image
|
||||
run-integration-tests:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: build image
|
||||
run: docker build -t local-zimit .
|
||||
|
||||
- name: ensure help display without issue
|
||||
run: docker run -v $PWD/output:/output local-zimit zimit --help
|
||||
|
||||
- name: run crawl with soft size limit
|
||||
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizesoftlimit.json
|
||||
|
||||
- name: run crawl with hard size limit
|
||||
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizehardlimit.json || true
|
||||
|
||||
- name: run crawl with soft time limit
|
||||
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timesoftlimit.json
|
||||
|
||||
- name: run crawl with hard time limit
|
||||
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timehardlimit.json || true
|
||||
|
||||
- name: run standard crawl
|
||||
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats.json --statsFilename /output/crawl.json --warc2zim-progress-file /output/warc2zim.json --keep
|
||||
|
||||
- name: run integration test suite
|
||||
run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
|
||||
45
.github/workflows/update-zim-offliner-definition.yaml
vendored
Normal file
45
.github/workflows/update-zim-offliner-definition.yaml
vendored
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
name: Update ZIMFarm Definitions
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "offliner-definition.json"
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
version:
|
||||
description: "Version to publish"
|
||||
required: false
|
||||
default: "dev"
|
||||
|
||||
jobs:
|
||||
prepare-json:
|
||||
runs-on: ubuntu-24.04
|
||||
outputs:
|
||||
offliner_definition_b64: ${{ steps.read-json.outputs.offliner_definition_b64 }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- id: read-json
|
||||
run: |
|
||||
if [ ! -f "offliner-definition.json" ]; then
|
||||
echo "File not found!" >&2
|
||||
exit 1
|
||||
fi
|
||||
json_b64=$(base64 -w0 <<< "$(jq -c . offliner-definition.json)")
|
||||
echo "offliner_definition_b64=$json_b64" >> $GITHUB_OUTPUT
|
||||
call-workflow:
|
||||
needs: prepare-json
|
||||
uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main
|
||||
with:
|
||||
version: ${{ github.event_name == 'release' && github.event.release.tag_name || (github.event.inputs.version || 'dev') }}
|
||||
offliner: zimit
|
||||
offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }}
|
||||
secrets:
|
||||
zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }}
|
||||
364
.gitignore
vendored
364
.gitignore
vendored
|
|
@ -1,4 +1,362 @@
|
|||
*.pyc
|
||||
__pycache__
|
||||
*.zim
|
||||
# Created by https://www.toptal.com/developers/gitignore/api/linux,macos,python,visualstudiocode,intellij
|
||||
# Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,python,visualstudiocode,intellij
|
||||
|
||||
### Intellij ###
|
||||
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
|
||||
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
||||
|
||||
# User-specific stuff
|
||||
.idea/**/workspace.xml
|
||||
.idea/**/tasks.xml
|
||||
.idea/**/usage.statistics.xml
|
||||
.idea/**/dictionaries
|
||||
.idea/**/shelf
|
||||
|
||||
# AWS User-specific
|
||||
.idea/**/aws.xml
|
||||
|
||||
# Generated files
|
||||
.idea/**/contentModel.xml
|
||||
|
||||
# Sensitive or high-churn files
|
||||
.idea/**/dataSources/
|
||||
.idea/**/dataSources.ids
|
||||
.idea/**/dataSources.local.xml
|
||||
.idea/**/sqlDataSources.xml
|
||||
.idea/**/dynamic.xml
|
||||
.idea/**/uiDesigner.xml
|
||||
.idea/**/dbnavigator.xml
|
||||
|
||||
# Gradle
|
||||
.idea/**/gradle.xml
|
||||
.idea/**/libraries
|
||||
|
||||
# Gradle and Maven with auto-import
|
||||
# When using Gradle or Maven with auto-import, you should exclude module files,
|
||||
# since they will be recreated, and may cause churn. Uncomment if using
|
||||
# auto-import.
|
||||
# .idea/artifacts
|
||||
# .idea/compiler.xml
|
||||
# .idea/jarRepositories.xml
|
||||
# .idea/modules.xml
|
||||
# .idea/*.iml
|
||||
# .idea/modules
|
||||
# *.iml
|
||||
# *.ipr
|
||||
|
||||
# CMake
|
||||
cmake-build-*/
|
||||
|
||||
# Mongo Explorer plugin
|
||||
.idea/**/mongoSettings.xml
|
||||
|
||||
# File-based project format
|
||||
*.iws
|
||||
|
||||
# IntelliJ
|
||||
out/
|
||||
|
||||
# mpeltonen/sbt-idea plugin
|
||||
.idea_modules/
|
||||
|
||||
# JIRA plugin
|
||||
atlassian-ide-plugin.xml
|
||||
|
||||
# Cursive Clojure plugin
|
||||
.idea/replstate.xml
|
||||
|
||||
# SonarLint plugin
|
||||
.idea/sonarlint/
|
||||
|
||||
# Crashlytics plugin (for Android Studio and IntelliJ)
|
||||
com_crashlytics_export_strings.xml
|
||||
crashlytics.properties
|
||||
crashlytics-build.properties
|
||||
fabric.properties
|
||||
|
||||
# Editor-based Rest Client
|
||||
.idea/httpRequests
|
||||
|
||||
# Android studio 3.1+ serialized cache file
|
||||
.idea/caches/build_file_checksums.ser
|
||||
|
||||
### Intellij Patch ###
|
||||
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
|
||||
|
||||
# *.iml
|
||||
# modules.xml
|
||||
# .idea/misc.xml
|
||||
# *.ipr
|
||||
|
||||
# Sonarlint plugin
|
||||
# https://plugins.jetbrains.com/plugin/7973-sonarlint
|
||||
.idea/**/sonarlint/
|
||||
|
||||
# SonarQube Plugin
|
||||
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
|
||||
.idea/**/sonarIssues.xml
|
||||
|
||||
# Markdown Navigator plugin
|
||||
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
|
||||
.idea/**/markdown-navigator.xml
|
||||
.idea/**/markdown-navigator-enh.xml
|
||||
.idea/**/markdown-navigator/
|
||||
|
||||
# Cache file creation bug
|
||||
# See https://youtrack.jetbrains.com/issue/JBR-2257
|
||||
.idea/$CACHE_FILE$
|
||||
|
||||
# CodeStream plugin
|
||||
# https://plugins.jetbrains.com/plugin/12206-codestream
|
||||
.idea/codestream.xml
|
||||
|
||||
# Azure Toolkit for IntelliJ plugin
|
||||
# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
|
||||
.idea/**/azureSettings.xml
|
||||
|
||||
### Linux ###
|
||||
*~
|
||||
|
||||
# temporary files which can be created if a process still has a handle open of a deleted file
|
||||
.fuse_hidden*
|
||||
|
||||
# KDE directory preferences
|
||||
.directory
|
||||
|
||||
# Linux trash folder which might appear on any partition or disk
|
||||
.Trash-*
|
||||
|
||||
# .nfs files are created when an open file is removed but is still being accessed
|
||||
.nfs*
|
||||
|
||||
### macOS ###
|
||||
# General
|
||||
.DS_Store
|
||||
.AppleDouble
|
||||
.LSOverride
|
||||
|
||||
# Icon must end with two \r
|
||||
Icon
|
||||
|
||||
|
||||
# Thumbnails
|
||||
._*
|
||||
|
||||
# Files that might appear in the root of a volume
|
||||
.DocumentRevisions-V100
|
||||
.fseventsd
|
||||
.Spotlight-V100
|
||||
.TemporaryItems
|
||||
.Trashes
|
||||
.VolumeIcon.icns
|
||||
.com.apple.timemachine.donotpresent
|
||||
|
||||
# Directories potentially created on remote AFP share
|
||||
.AppleDB
|
||||
.AppleDesktop
|
||||
Network Trash Folder
|
||||
Temporary Items
|
||||
.apdisk
|
||||
|
||||
### macOS Patch ###
|
||||
# iCloud generated files
|
||||
*.icloud
|
||||
|
||||
### Python ###
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
### Python Patch ###
|
||||
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
|
||||
poetry.toml
|
||||
|
||||
# ruff
|
||||
.ruff_cache/
|
||||
|
||||
# LSP config files
|
||||
pyrightconfig.json
|
||||
|
||||
### VisualStudioCode ###
|
||||
.vscode/*
|
||||
!.vscode/settings.json
|
||||
!.vscode/tasks.json
|
||||
!.vscode/launch.json
|
||||
!.vscode/extensions.json
|
||||
!.vscode/*.code-snippets
|
||||
|
||||
# Local History for Visual Studio Code
|
||||
.history/
|
||||
|
||||
# Built Visual Studio Code Extensions
|
||||
*.vsix
|
||||
|
||||
### VisualStudioCode Patch ###
|
||||
# Ignore all local history of files
|
||||
.history
|
||||
.ionide
|
||||
|
||||
# End of https://www.toptal.com/developers/gitignore/api/linux,macos,python,visualstudiocode,intellij
|
||||
|
||||
# output dir
|
||||
output
|
||||
|
||||
# ignore all vscode files: they are editor-specific and not maintained by openzim
|
||||
.vscode
|
||||
|
|
|
|||
27
.pre-commit-config.yaml
Normal file
27
.pre-commit-config.yaml
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
# See https://pre-commit.com for more information
|
||||
# See https://pre-commit.com/hooks.html for more hooks
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v5.0.0
|
||||
hooks:
|
||||
- id: trailing-whitespace
|
||||
- id: end-of-file-fixer
|
||||
- repo: https://github.com/psf/black
|
||||
rev: "25.1.0"
|
||||
hooks:
|
||||
- id: black
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.9.4
|
||||
hooks:
|
||||
- id: ruff
|
||||
- repo: https://github.com/RobertCraigie/pyright-python
|
||||
rev: v1.1.393
|
||||
hooks:
|
||||
- id: pyright
|
||||
name: pyright (system)
|
||||
description: 'pyright static type checker'
|
||||
entry: pyright
|
||||
language: system
|
||||
'types_or': [python, pyi]
|
||||
require_serial: true
|
||||
minimum_pre_commit_version: '2.9.2'
|
||||
409
CHANGELOG.md
Normal file
409
CHANGELOG.md
Normal file
|
|
@ -0,0 +1,409 @@
|
|||
## Changelog
|
||||
|
||||
All notable changes to this project are documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- Added `--overwrite` flag to overwrite existing ZIM file if it exists (#399)
|
||||
|
||||
### Changed
|
||||
- Fix issues preventing interrupted crawls from being resumed. (#499)
|
||||
- Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist.
|
||||
- Use all warc_dirs found instead of just the latest so interrupted crawls use all collected pages across runs when an explicit collections directory is not passed.
|
||||
- Don't cleanup an explicitly passed build directory.
|
||||
|
||||
## [3.0.5] - 2025-04-11
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.6.0 (#493)
|
||||
|
||||
## [3.0.4] - 2025-04-04
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.5.10 (#491)
|
||||
|
||||
## [3.0.3] - 2025-02-28
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.5.7 (#483)
|
||||
|
||||
## [3.0.2] - 2025-02-27
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.5.6 (#482)
|
||||
|
||||
## [3.0.1] - 2025-02-24
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.5.4 (#476)
|
||||
|
||||
## [3.0.0] - 2025-02-17
|
||||
|
||||
### Changed
|
||||
|
||||
- Change solution to report partial ZIM to the Zimfarm and other clients (#304)
|
||||
- Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
|
||||
- Add many missing Browsertrix Crawler arguments ; drop default overrides by zimit ; drop `--noMobileDevice` setting (not needed anymore) (#433)
|
||||
- Document all Browsertrix Crawler default arguments values (#416)
|
||||
- Use preferred Browsertrix Crawler arguments names: (part of #471)
|
||||
- `--seeds` instead of `--url`
|
||||
- `--seedFile` instead of `--urlFile`
|
||||
- `--pageLimit` instead of `--limit`
|
||||
- `--pageLoadTimeout` instead of `--timeout`
|
||||
- `--scopeIncludeRx` instead of `--include`
|
||||
- `--scopeExcludeRx` instead of `--exclude`
|
||||
- `--pageExtraDelay` instead of `--delay`
|
||||
- Remove confusion between zimit, warc2zim and crawler stats filenames (part of #471)
|
||||
- `--statsFilename` is now the crawler stats file (since it is the same name, just like other arguments)
|
||||
- `--zimit-progress-file` is now the zimit stats location
|
||||
- `--warc2zim-progress-file` is the warc2zim stats location
|
||||
- all are optional values; if not set and needed, temporary files are used
|
||||
|
||||
### Fixed
|
||||
|
||||
- Do not create the ZIM when crawl is incomplete (#444)
|
||||
|
||||
## [2.1.8] - 2025-02-07
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.5.1, Python 3.13 and others (#462 + #464)
|
||||
|
||||
## [2.1.7] - 2025-01-10
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.4.2 (#450)
|
||||
- Upgrade to warc2zim 2.2.0
|
||||
|
||||
## [2.1.6] - 2024-11-07
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.3.5 (#426)
|
||||
|
||||
## [2.1.5] - 2024-11-01
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.3.4 and warc2zim 2.1.3 (#424)
|
||||
|
||||
## [2.1.4] - 2024-10-11
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.3.3 (#411)
|
||||
|
||||
## [2.1.3] - 2024-10-08
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.3.2, warc2zim 2.1.2 and other dependencies (#406)
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fix help (#393)
|
||||
|
||||
## [2.1.2] - 2024-09-09
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.3.0-beta.1 (#387) (fixes "Ziming a website with huge assets (e.g. PDFs) is failing to proceed" - #380)
|
||||
|
||||
## [2.1.1] - 2024-09-05
|
||||
|
||||
### Added
|
||||
|
||||
- Add support for uncompressed tar archive in --warcs (#369)
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.3.0-beta.0 (#379), including upgrade to Ubuntu Noble (#307)
|
||||
|
||||
### Fixed
|
||||
|
||||
- Stream files downloads to not exhaust memory (#373)
|
||||
- Fix documentation on `--diskUtilization` setting (#375)
|
||||
|
||||
## [2.1.0] - 2024-08-09
|
||||
|
||||
### Added
|
||||
|
||||
- Add `--custom-behaviors` argument to support path/HTTP(S) URL custom behaviors to pass to the crawler (#313)
|
||||
- Add daily automated end-to-end tests of a page with Youtube player (#330)
|
||||
- Add `--warcs` option to directly process WARC files (#301)
|
||||
|
||||
### Changed
|
||||
|
||||
- Make it clear that `--profile` argument can be an HTTP(S) URL (and not only a path) (#288)
|
||||
- Fix README imprecisions + add back warc2zim availability in docker image (#314)
|
||||
- Enhance integration test to assert final content of the ZIM (#287)
|
||||
- Stop fetching and passing browsertrix crawler version as scraperSuffix to warc2zim (#354)
|
||||
- Do not log number of WARC files found (#357)
|
||||
- Upgrade dependencies (warc2zim 2.1.0)
|
||||
|
||||
### Fixed
|
||||
|
||||
- Sort WARC directories found by modification time (#366)
|
||||
|
||||
## [2.0.6] - 2024-08-02
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgraded Browsertrix Crawler to 1.2.6
|
||||
|
||||
## [2.0.5] - 2024-07-24
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgraded Browsertrix Crawler to 1.2.5
|
||||
- Upgraded warc2zim to 2.0.3
|
||||
|
||||
## [2.0.4] - 2024-07-15
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgraded Browsertrix Crawler to 1.2.4 (fixes retrieve automatically the assets present in a data-xxx tag #316)
|
||||
|
||||
## [2.0.3] - 2024-06-24
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgraded Browsertrix Crawler to 1.2.0 (fixes Youtube videos issue #323)
|
||||
|
||||
## [2.0.2] - 2024-06-18
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade dependencies (mainly warc2zim 2.0.2)
|
||||
|
||||
|
||||
## [2.0.1] - 2024-06-13
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade dependencies (especially warc2zim 2.0.1 and browsertrix crawler 1.2.0-beta.0) (#318)
|
||||
|
||||
### Fixed
|
||||
|
||||
- Crawler is not correctly checking disk size / usage (#305)
|
||||
|
||||
## [2.0.0] - 2024-06-04
|
||||
|
||||
### Added
|
||||
|
||||
- New `--version` flag to display Zimit version (#234)
|
||||
- New `--logging` flag to adjust Browsertrix Crawler logging (#273)
|
||||
- Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275)
|
||||
- New `--noMobileDevice` CLI argument
|
||||
- Publish Docker image for `linux/arm64` (in addition to `linux/amd64`) (#178)
|
||||
|
||||
### Changed
|
||||
|
||||
- **Use `warc2zim` version 2**, which works without Service Worker anymore (#193)
|
||||
- Upgraded Browsertrix Crawler to 1.1.3
|
||||
- Adopt Python bootstrap conventions
|
||||
- Upgrade to Python 3.12 + upgrade dependencies
|
||||
- Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim (#284)
|
||||
- Drop initial check of URL in Python (#256)
|
||||
- `--userAgent` CLI argument overrides again the `--userAgentSuffix` and `--adminEmail` values
|
||||
- `--userAgent` CLI argument is not mandatory anymore
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fix support for Youtube videos (#291)
|
||||
- Fix crawler `--waitUntil` values (#289)
|
||||
|
||||
## [1.6.3] - 2024-01-18
|
||||
|
||||
### Changed
|
||||
|
||||
- Adapt to new `warc2zim` code structure
|
||||
- Using browsertrix-crawler 0.12.4
|
||||
- Using warc2zim 1.5.5
|
||||
|
||||
### Added
|
||||
|
||||
- New `--build` parameter (optional) to specify the directory holding Browsertrix files ; if not set, `--output`
|
||||
directory is used ; zimit creates one subdir of this folder per invocation to isolate datasets ; subdir is kept only
|
||||
if `--keep` is set.
|
||||
|
||||
### Fixed
|
||||
|
||||
- `--collection` parameter was not working (#252)
|
||||
|
||||
## [1.6.2] - 2023-11-17
|
||||
|
||||
### Changed
|
||||
|
||||
- Using browsertrix-crawler 0.12.3
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fix logic passing args to crawler to support value '0' (#245)
|
||||
- Fix documentation about Chrome and headless (#248)
|
||||
|
||||
## [1.6.1] - 2023-11-06
|
||||
|
||||
### Changed
|
||||
|
||||
- Using browsertrix-crawler 0.12.1
|
||||
|
||||
## [1.6.0] - 2023-11-02
|
||||
|
||||
### Changed
|
||||
|
||||
- Scraper fails for all HTTP error codes returned when checking URL at startup (#223)
|
||||
- User-Agent now has a default value (#228)
|
||||
- Manipulation of spaces with UA suffix and adminEmail has been modified
|
||||
- Same User-Agent is used for check_url (Python) and Browsertrix crawler (#227)
|
||||
- Using browsertrix-crawler 0.12.0
|
||||
|
||||
## [1.5.3] - 2023-10-02
|
||||
|
||||
### Changed
|
||||
|
||||
- Using browsertrix-crawler 0.11.2
|
||||
|
||||
## [1.5.2] - 2023-09-19
|
||||
|
||||
### Changed
|
||||
|
||||
- Using browsertrix-crawler 0.11.1
|
||||
|
||||
## [1.5.1] - 2023-09-18
|
||||
|
||||
### Changed
|
||||
|
||||
- Using browsertrix-crawler 0.11.0
|
||||
- Scraper stat file is not created empty (#211)
|
||||
- Crawler statistics are not available anymore (#213)
|
||||
- Using warc2zim 1.5.4
|
||||
|
||||
## [1.5.0] - 2023-08-23
|
||||
|
||||
### Added
|
||||
|
||||
- `--long-description` param
|
||||
|
||||
## [1.4.1] - 2023-08-23
|
||||
|
||||
### Changed
|
||||
|
||||
- Using browsertrix-crawler 0.10.4
|
||||
- Using warc2zim 1.5.3
|
||||
|
||||
## [1.4.0] - 2023-08-02
|
||||
|
||||
### Added
|
||||
|
||||
- `--title` to set ZIM title
|
||||
- `--description` to set ZIM description
|
||||
- New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization`
|
||||
- `--zim-lang` param to set warc2zim's `--lang` (ISO-639-3)
|
||||
|
||||
### Changed
|
||||
|
||||
- Using browsertrix-crawler 0.10.2
|
||||
- Default and accepted values for `--waitUntil` from crawler's update
|
||||
- Using warc2zim 1.5.2
|
||||
- Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172)
|
||||
- `--failOnFailedSeed` used unconditionally
|
||||
- `--lang` now passed to crawler (ISO-639-1)
|
||||
|
||||
### Removed
|
||||
|
||||
- `--newContext` from crawler's update
|
||||
|
||||
## [1.3.1] - 2023-02-06
|
||||
|
||||
### Changed
|
||||
|
||||
- Using browsertrix-crawler 0.8.0
|
||||
- Using warc2zim version 1.5.1 with wabac.js 2.15.2
|
||||
|
||||
## [1.3.0] - 2023-02-02
|
||||
|
||||
### Added
|
||||
|
||||
- Initial url check normalizes homepage redirects to standard ports – 80/443 (#137)
|
||||
|
||||
### Changed
|
||||
|
||||
- Using warc2zim version 1.5.0 with scope conflict fix and videos fix
|
||||
- Using browsertrix-crawler 0.8.0-beta.1
|
||||
- Fixed `--allowHashUrls` being a boolean param
|
||||
- Increased `check_url` timeout (12s to connect, 27s to read) instead of 10s
|
||||
|
||||
## [1.2.0] - 2022-06-21
|
||||
|
||||
### Added
|
||||
|
||||
- `--urlFile` browsertrix crawler parameter
|
||||
- `--depth` browsertrix crawler parameter
|
||||
- `--extraHops` parameter
|
||||
- `--collection` browsertrix crawler parameter
|
||||
- `--allowHashUrls` browsertrix crawler parameter
|
||||
- `--userAgentSuffix` browsertrix crawler parameter
|
||||
- `--behaviors` parameter
|
||||
- `--behaviorTimeout` browsertrix crawler parameter
|
||||
- `--profile` browsertrix crawler parameter
|
||||
- `--sizeLimit` browsertrix crawler parameter
|
||||
- `--timeLimit` browsertrix crawler parameter
|
||||
- `--healthCheckPort` parameter
|
||||
- `--overwrite` parameter
|
||||
|
||||
### Changed
|
||||
|
||||
- using browsertrix-crawler `0.6.0` and warc2zim `1.4.2`
|
||||
- default WARC location after crawl changed
|
||||
from `collections/capture-*/archive/` to `collections/crawl-*/archive/`
|
||||
|
||||
### Removed
|
||||
|
||||
- `--scroll` browsertrix crawler parameter (see `--behaviors`)
|
||||
- `--scope` browsertrix crawler parameter (see `--scopeType`, `--include` and `--exclude`)
|
||||
|
||||
|
||||
## [1.1.5]
|
||||
|
||||
- using crawler 0.3.2 and warc2zim 1.3.6
|
||||
|
||||
## [1.1.4]
|
||||
|
||||
- Defaults to `load,networkidle0` for waitUntil param (same as crawler)
|
||||
- Allows setting combinations of values for waitUntil param
|
||||
- Updated warc2zim to 1.3.5
|
||||
- Updated browsertrix-crawler to 0.3.1
|
||||
- Warc to zim now written to `{temp_root_dir}/collections/capture-*/archive/` where
|
||||
`capture-*` is dynamic and includes the datetime. (from browsertrix-crawler)
|
||||
|
||||
## [1.1.3]
|
||||
|
||||
- allows same first-level-domain redirects
|
||||
- fixed redirects to URL in scope
|
||||
- updated crawler to 0.2.0
|
||||
- `statsFilename` now informs whether limit was hit or not
|
||||
|
||||
## [1.1.2]
|
||||
|
||||
- added support for --custom-css
|
||||
- added domains block list (default)
|
||||
|
||||
## [1.1.1]
|
||||
|
||||
- updated browsertrix-crawler to 0.1.4
|
||||
- autofetcher script to be injected by defaultDriver to capture srcsets + URLs in dynamically added stylesheets
|
||||
|
||||
## [1.0]
|
||||
|
||||
- initial version using browsertrix-crawler:0.1.3 and warc2zim:1.3.3
|
||||
49
Dockerfile
Normal file
49
Dockerfile
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
FROM webrecorder/browsertrix-crawler:1.6.0
|
||||
LABEL org.opencontainers.image.source=https://github.com/openzim/zimit
|
||||
|
||||
# add deadsnakes ppa for latest Python on Ubuntu
|
||||
RUN add-apt-repository ppa:deadsnakes/ppa -y
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -qqy --no-install-recommends \
|
||||
libmagic1 \
|
||||
python3.13-venv \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
# python setup (in venv not to conflict with browsertrix)
|
||||
&& python3.13 -m venv /app/zimit \
|
||||
# placeholder (default output location)
|
||||
&& mkdir -p /output \
|
||||
# disable chrome upgrade
|
||||
&& printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
|
||||
# download list of bad domains to filter-out. intentionally run post-install \
|
||||
# so it's not cached in earlier layers (url stays same but content updated) \
|
||||
&& mkdir -p /tmp/ads \
|
||||
&& cd /tmp/ads \
|
||||
&& curl -L -O https://hosts.anudeep.me/mirror/adservers.txt \
|
||||
&& curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt \
|
||||
&& curl -L -O https://hosts.anudeep.me/mirror/facebook.txt \
|
||||
&& cat ./*.txt > /etc/blocklist.txt \
|
||||
&& rm ./*.txt \
|
||||
&& printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh \
|
||||
&& chmod +x /usr/local/bin/entrypoint.sh
|
||||
|
||||
# Copy pyproject.toml and its dependencies
|
||||
COPY pyproject.toml README.md /src/
|
||||
COPY src/zimit/__about__.py /src/src/zimit/__about__.py
|
||||
|
||||
# Install Python dependencies
|
||||
RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src
|
||||
|
||||
# Copy code + associated artifacts
|
||||
COPY src /src/src
|
||||
COPY *.md /src/
|
||||
|
||||
# Install + cleanup
|
||||
RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src \
|
||||
&& ln -s /app/zimit/bin/zimit /usr/bin/zimit \
|
||||
&& ln -s /app/zimit/bin/warc2zim /usr/bin/warc2zim \
|
||||
&& chmod +x /usr/bin/zimit \
|
||||
&& rm -rf /src
|
||||
|
||||
ENTRYPOINT ["entrypoint.sh"]
|
||||
CMD ["zimit", "--help"]
|
||||
674
LICENSE
Normal file
674
LICENSE
Normal file
|
|
@ -0,0 +1,674 @@
|
|||
GNU GENERAL PUBLIC LICENSE
|
||||
Version 3, 29 June 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The GNU General Public License is a free, copyleft license for
|
||||
software and other kinds of works.
|
||||
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
the GNU General Public License is intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users. We, the Free Software Foundation, use the
|
||||
GNU General Public License for most of our software; it applies also to
|
||||
any other work released this way by its authors. You can apply it to
|
||||
your programs, too.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
|
||||
To protect your rights, we need to prevent others from denying you
|
||||
these rights or asking you to surrender the rights. Therefore, you have
|
||||
certain responsibilities if you distribute copies of the software, or if
|
||||
you modify it: responsibilities to respect the freedom of others.
|
||||
|
||||
For example, if you distribute copies of such a program, whether
|
||||
gratis or for a fee, you must pass on to the recipients the same
|
||||
freedoms that you received. You must make sure that they, too, receive
|
||||
or can get the source code. And you must show them these terms so they
|
||||
know their rights.
|
||||
|
||||
Developers that use the GNU GPL protect your rights with two steps:
|
||||
(1) assert copyright on the software, and (2) offer you this License
|
||||
giving you legal permission to copy, distribute and/or modify it.
|
||||
|
||||
For the developers' and authors' protection, the GPL clearly explains
|
||||
that there is no warranty for this free software. For both users' and
|
||||
authors' sake, the GPL requires that modified versions be marked as
|
||||
changed, so that their problems will not be attributed erroneously to
|
||||
authors of previous versions.
|
||||
|
||||
Some devices are designed to deny users access to install or run
|
||||
modified versions of the software inside them, although the manufacturer
|
||||
can do so. This is fundamentally incompatible with the aim of
|
||||
protecting users' freedom to change the software. The systematic
|
||||
pattern of such abuse occurs in the area of products for individuals to
|
||||
use, which is precisely where it is most unacceptable. Therefore, we
|
||||
have designed this version of the GPL to prohibit the practice for those
|
||||
products. If such problems arise substantially in other domains, we
|
||||
stand ready to extend this provision to those domains in future versions
|
||||
of the GPL, as needed to protect the freedom of users.
|
||||
|
||||
Finally, every program is threatened constantly by software patents.
|
||||
States should not allow patents to restrict development and use of
|
||||
software on general-purpose computers, but in those that do, we wish to
|
||||
avoid the special danger that patents applied to a free program could
|
||||
make it effectively proprietary. To prevent this, the GPL assures that
|
||||
patents cannot be used to render the program non-free.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
TERMS AND CONDITIONS
|
||||
|
||||
0. Definitions.
|
||||
|
||||
"This License" refers to version 3 of the GNU General Public License.
|
||||
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
|
||||
1. Source Code.
|
||||
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
|
||||
2. Basic Permissions.
|
||||
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
|
||||
4. Conveying Verbatim Copies.
|
||||
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
|
||||
5. Conveying Modified Source Versions.
|
||||
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
|
||||
6. Conveying Non-Source Forms.
|
||||
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
|
||||
7. Additional Terms.
|
||||
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
|
||||
8. Termination.
|
||||
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Use with the GNU Affero General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU Affero General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the special requirements of the GNU Affero General Public License,
|
||||
section 13, concerning interaction through a network will apply to the
|
||||
combination as such.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU General Public License from time to time. Such new versions will
|
||||
be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If the program does terminal interaction, make it output a short
|
||||
notice like this when it starts in an interactive mode:
|
||||
|
||||
<program> Copyright (C) <year> <name of author>
|
||||
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||
This is free software, and you are welcome to redistribute it
|
||||
under certain conditions; type `show c' for details.
|
||||
|
||||
The hypothetical commands `show w' and `show c' should show the appropriate
|
||||
parts of the General Public License. Of course, your program's commands
|
||||
might be different; for a GUI interface, you would use an "about box".
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU GPL, see
|
||||
<https://www.gnu.org/licenses/>.
|
||||
|
||||
The GNU General Public License does not permit incorporating your program
|
||||
into proprietary programs. If your program is a subroutine library, you
|
||||
may consider it more useful to permit linking proprietary applications with
|
||||
the library. If this is what you want to do, use the GNU Lesser General
|
||||
Public License instead of this License. But first, please read
|
||||
<https://www.gnu.org/licenses/why-not-lgpl.html>.
|
||||
90
README.md
Normal file
90
README.md
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
Zimit
|
||||
=====
|
||||
|
||||
Zimit is a scraper that creates a [ZIM file](https://en.wikipedia.org/wiki/ZIM_(file_format)) from any website.
|
||||
|
||||
[CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit)
|
||||
[License: GPL v3](https://www.gnu.org/licenses/gpl-3.0)
|
||||
[Docker image](https://ghcr.io/openzim/zimit)
|
||||
|
||||
Zimit adheres to openZIM's [Contribution Guidelines](https://github.com/openzim/overview/wiki/Contributing).
|
||||
|
||||
Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/blob/main/docs/Policy.md) **v1.0.1**.
|
||||
|
||||
Capabilities and known limitations
|
||||
--------------------
|
||||
|
||||
While we would like to support as many websites as possible, making an offline archive of any website with a versatile tool obviously has some limitations.
|
||||
|
||||
Most capabilities and known limitations are documented in [warc2zim README](https://github.com/openzim/warc2zim/blob/main/README.md). There are also some limitations in Browsertrix Crawler (used to fetch the website) and wombat (used to properly replay dynamic web requests), but these are not (yet?) clearly documented.
|
||||
|
||||
Technical background
|
||||
--------------------
|
||||
|
||||
Zimit runs a fully automated browser-based crawl of a website property and produces a ZIM of the crawled content. Zimit runs in a Docker container.
|
||||
|
||||
The system:
|
||||
- runs a website crawl with [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler), which produces WARC files
|
||||
- converts the crawled WARC files to a single ZIM using [warc2zim](https://github.com/openzim/warc2zim)
|
||||
|
||||
The `zimit.py` is the entrypoint for the system.
|
||||
|
||||
After the crawl is done, warc2zim is used to write a ZIM to the `/output` directory, which should be mounted as a volume so that the ZIM is not lost when the container stops.
|
||||
|
||||
With the `--keep` flag, the crawled WARCs and a few other artifacts are also kept in a temporary directory inside `/output`.
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
`zimit` is intended to be run in Docker. The Docker image is published at https://github.com/orgs/openzim/packages/container/package/zimit.
|
||||
|
||||
The image accepts the following parameters, **as well as any of the [Browsertrix crawler](https://crawler.docs.browsertrix.com/user-guide/cli-options/) and [warc2zim](https://github.com/openzim/warc2zim) ones**:
|
||||
|
||||
- Required: `--seeds URL` - the URL to start crawling from; multiple URLs can be separated by commas (this is **usually not needed**, since these are just the **seeds** of the crawl); the first seed URL is used as the ZIM homepage
|
||||
- Required: `--name` - Name of ZIM file
|
||||
- `--output` - output directory (defaults to `/output`)
|
||||
- `--pageLimit U` - Limit capture to at most U URLs
|
||||
- `--scopeExcludeRx <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--scopeExcludeRx="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded.
|
||||
- `--workers N` - number of crawl workers to be run in parallel
|
||||
- `--waitUntil` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--waitUntil domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
|
||||
- `--keep` - by default, WARC files and other temporary files (stored in a subfolder of the output directory) are kept only when the run fails and are deleted automatically when it succeeds. Use this flag to always keep the WARC files, even in case of success.
|
||||
|
||||
Example command:
|
||||
|
||||
```bash
|
||||
docker run ghcr.io/openzim/zimit zimit --help
|
||||
docker run ghcr.io/openzim/zimit warc2zim --help
|
||||
docker run -v /output:/output ghcr.io/openzim/zimit zimit --seeds URL --name myzimfile
|
||||
```
|
||||
|
||||
**Note**: The image automatically filters out a large number of ads by using the 3 blocklists from [anudeepND](https://github.com/anudeepND/blacklist). If you don't want this filtering, disable the image's entrypoint in your container (`docker run --entrypoint="" ghcr.io/openzim/zimit ...`).
|
||||
|
||||
To re-build the Docker image locally run:
|
||||
|
||||
```bash
|
||||
docker build -t ghcr.io/openzim/zimit .
|
||||
```
|
||||
|
||||
FAQ
|
||||
---
|
||||
|
||||
The Zimit contributors' team maintains [a page with the most Frequently Asked Questions](https://github.com/openzim/zimit/wiki/Frequently-Asked-Questions).
|
||||
|
||||
Nota bene
|
||||
---------
|
||||
|
||||
While Zimit 1.x relied on a Service Worker to display the ZIM content, this is no longer the case
|
||||
since Zimit 2.x, which does not have any special requirements anymore.
|
||||
|
||||
It should also be noted that a first version of a generic HTTP scraper was created in 2016 during
|
||||
the [Wikimania Esino Lario
|
||||
Hackathon](https://wikimania2016.wikimedia.org/wiki/Programme/Kiwix-dedicated_Hackathon).
|
||||
|
||||
That version is now considered outdated and [archived in `2016`
|
||||
branch](https://github.com/openzim/zimit/tree/2016).
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
[GPLv3](https://www.gnu.org/licenses/gpl-3.0) or later, see
|
||||
[LICENSE](LICENSE) for more details.
|
||||
246
README.rst
246
README.rst
|
|
@ -1,246 +0,0 @@
|
|||
#####################################
|
||||
Create ZIM files out of HTTP websites
|
||||
#####################################
|
||||
|
||||
This project provides an API and a user interface in order to convert any
|
||||
website into a Zim file.
|
||||
|
||||
Exposed API
|
||||
###########
|
||||
|
||||
All APIs are talking JSON over HTTP. As such, all parameters should be sent as
|
||||
stringified JSON and the Content-Type should be set to "application/json".
|
||||
|
||||
POST /website-zim
|
||||
=================
|
||||
|
||||
By posting to this endpoint, you are asking the system to start a new download
|
||||
of a website and a conversion into a Zim format.
|
||||
|
||||
Required parameters
|
||||
-------------------
|
||||
|
||||
- **url**: URL of the website to be crawled
|
||||
- **title**: Title that will be used in the created Zim file
|
||||
- **email**: Email address that will get notified when the creation of the file is over
|
||||
|
||||
Optional parameters
|
||||
-------------------
|
||||
|
||||
- **language**: An `ISO 639-3 <https://en.wikipedia.org/wiki/ISO_639-3>`_ code
|
||||
representing the language
|
||||
- **welcome**: the page that will be first shown in the Zim file
|
||||
- **description**: The description that will be embedded in the Zim file
|
||||
- **author**: The author of the content
|
||||
|
||||
Return values
|
||||
-------------
|
||||
|
||||
- **job_id**: The job id is returned in JSON format. It can be used to know the
|
||||
status of the process.
|
||||
|
||||
Status codes
|
||||
------------
|
||||
|
||||
- `400 Bad Request` will be returned in case you are not respecting the
|
||||
expected inputs. In case of error, have a look at the body of the response:
|
||||
it contains information about what is missing.
|
||||
- `201 Created` will be returned if the process started.
|
||||
|
||||
Example
|
||||
-------
|
||||
|
||||
::
|
||||
|
||||
$ http POST http://0.0.0.0:6543/website-url url="https://refugeeinfo.eu/" title="Refugee Info" email="alexis@notmyidea.org"
|
||||
HTTP/1.1 201 Created
|
||||
|
||||
{
|
||||
"job": "5012abe3-bee2-4dd7-be87-39a88d76035d"
|
||||
}
|
||||
|
||||
|
||||
GET /status/{jobid}
|
||||
===================
|
||||
|
||||
Retrieves the status of a job and displays the associated logs.
|
||||
|
||||
Return values
|
||||
-------------
|
||||
|
||||
- **status**: The status of the job; it is one of 'queued', 'finished',
|
||||
'failed', 'started' and 'deferred'.
|
||||
- **log**: The logs of the job.
|
||||
|
||||
Status codes
|
||||
------------
|
||||
|
||||
- `404 Not Found` will be returned in case the requested job does not exist.
|
||||
- `200 OK` will be returned in any other case.
|
||||
|
||||
Example
|
||||
-------
|
||||
|
||||
::
|
||||
|
||||
http GET http://0.0.0.0:6543/status/5012abe3-bee2-4dd7-be87-39a88d76035d
|
||||
HTTP/1.1 200 OK
|
||||
|
||||
{
|
||||
"log": "<snip>",
|
||||
"status": "finished"
|
||||
}
|
||||
|
||||
|
||||
Okay, so how do I install it on my server?
|
||||
##########################################
|
||||
|
||||
Currently, the best way to install it is by retrieving the sources from GitHub
|
||||
|
||||
::
|
||||
|
||||
$ git clone https://github.com/almet/zimit.git
|
||||
$ cd zimit
|
||||
|
||||
Create a virtual environment and install the project in it::
|
||||
|
||||
$ virtualenv venv
|
||||
$ venv/bin/pip install -e .
|
||||
|
||||
Then, run it how you want, for instance with pserve::
|
||||
|
||||
$ venv/bin/pserve zimit.ini
|
||||
|
||||
|
||||
In a separate process, you also need to run the worker::
|
||||
|
||||
$ venv/bin/rqworker
|
||||
|
||||
|
||||
And you're ready to go. To test it::
|
||||
|
||||
$ http POST http://0.0.0.0:6543/website-url url="https://refugeeinfo.eu/" title="Refugee Info" email="alexis@notmyidea.org"
|
||||
|
||||
|
||||
Debian dependencies
|
||||
####################
|
||||
|
||||
Installing the dependencies
|
||||
===========================
|
||||
|
||||
::
|
||||
|
||||
sudo apt-get install httrack libzim-dev libmagic-dev liblzma-dev libz-dev build-essential libtool libgumbo-dev redis-server automake pkg-config
|
||||
|
||||
Installing zimwriterfs
|
||||
======================
|
||||
|
||||
::
|
||||
|
||||
git clone https://github.com/wikimedia/openzim.git
|
||||
cd openzim/zimwriterfs
|
||||
./autogen.sh
|
||||
./configure
|
||||
make
|
||||
|
||||
Then update the path to the zimwriterfs executable in zimit.ini
|
||||
|
||||
::
|
||||
|
||||
$ rqworker & pserve zimit.ini
|
||||
|
||||
How to deploy?
|
||||
##############
|
||||
|
||||
There are multiple ways to deploy such service, so I'll describe how I do it
|
||||
with my own best-practices.
|
||||
|
||||
First of all, get all the dependencies and the code. I like to have everything
|
||||
available in /home/www, so let's consider this will be the case here::
|
||||
|
||||
$ mkdir /home/www/zimit.notmyidea.org
|
||||
$ cd /home/www/zimit.notmyidea.org
|
||||
$ git clone https://github.com/almet/zimit.git
|
||||
|
||||
Then, you can change the configuration file, by creating a new one::
|
||||
|
||||
$ cd zimit
|
||||
$ cp zimit.ini local.ini
|
||||
|
||||
From there, you need to update the configuration to point to the correct
|
||||
binaries and locations.
|
||||
|
||||
Nginx configuration
|
||||
===================
|
||||
|
||||
::
|
||||
|
||||
# the upstream component nginx needs to connect to
|
||||
upstream zimit_upstream {
|
||||
server unix:///tmp/zimit.sock;
|
||||
}
|
||||
|
||||
# configuration of the server
|
||||
server {
|
||||
listen 80;
|
||||
listen [::]:80;
|
||||
server_name zimit.ideascube.org;
|
||||
charset utf-8;
|
||||
|
||||
client_max_body_size 200M;
|
||||
|
||||
location /zims {
|
||||
alias /home/ideascube/zimit.ideascube.org/zims/;
|
||||
autoindex on;
|
||||
}
|
||||
|
||||
# Finally, send all non-media requests to the Pyramid server.
|
||||
location / {
|
||||
uwsgi_pass zimit_upstream;
|
||||
include /var/ideascube/uwsgi_params;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
UWSGI configuration
|
||||
===================
|
||||
|
||||
::
|
||||
|
||||
[uwsgi]
|
||||
uid = ideascube
|
||||
gid = ideascube
|
||||
chdir = /home/ideascube/zimit.ideascube.org/zimit/
|
||||
ini = /home/ideascube/zimit.ideascube.org/zimit/local.ini
|
||||
# the virtualenv (full path)
|
||||
home = /home/ideascube/zimit.ideascube.org/venv/
|
||||
|
||||
# process-related settings
|
||||
# master
|
||||
master = true
|
||||
# maximum number of worker processes
|
||||
processes = 4
|
||||
# the socket (use the full path to be safe)
|
||||
socket = /tmp/zimit.sock
|
||||
# ... with appropriate permissions - may be needed
|
||||
chmod-socket = 666
|
||||
# stats = /tmp/ideascube.stats.sock
|
||||
# clear environment on exit
|
||||
vacuum = true
|
||||
plugins = python
|
||||
|
||||
|
||||
supervisord configuration
|
||||
=========================
|
||||
|
||||
::
|
||||
|
||||
[program:zimit-worker]
|
||||
command=/home/ideascube/zimit.ideascube.org/venv/bin/rqworker
|
||||
directory=/home/ideascube/zimit.ideascube.org/zimit/
|
||||
user=www-data
|
||||
autostart=true
|
||||
autorestart=true
|
||||
redirect_stderr=true
|
||||
|
||||
That's it!
|
||||
24
app.wsgi
24
app.wsgi
|
|
@ -1,24 +0,0 @@
|
|||
try:
|
||||
import ConfigParser as configparser
|
||||
except ImportError:
|
||||
import configparser
|
||||
import logging.config
|
||||
import os
|
||||
|
||||
from zimit import main
|
||||
|
||||
# Directory holding this WSGI entry point; the default ini file lives next to it.
here = os.path.dirname(__file__)

# The ZIMIT_INI environment variable, when set, overrides the default
# location of the ini file (local.ini beside this script).
ini_path = os.environ.get('ZIMIT_INI', os.path.join(here, 'local.ini'))

# The same ini file drives both the logging setup and the application settings.
logging.config.fileConfig(ini_path)

config = configparser.ConfigParser()
config.read(ini_path)

# `main` comes from the zimit package (presumably the application factory --
# confirm against zimit/__init__.py). It receives the DEFAULT items
# positionally and the [app:main] items as keyword settings.
_global_config = config.items('DEFAULT')
_app_settings = dict(config.items('app:main'))

# Module-level WSGI callable looked up by the application server.
application = main(_global_config, **_app_settings)
|
||||
|
|
@ -1 +0,0 @@
|
|||
.alertify-logs>*{padding:12px 24px;color:#fff;box-shadow:0 2px 5px 0 rgba(0,0,0,.2);border-radius:1px}.alertify-logs>*,.alertify-logs>.default{background:rgba(0,0,0,.8)}.alertify-logs>.error{background:rgba(244,67,54,.8)}.alertify-logs>.success{background:rgba(76,175,80,.9)}.alertify{position:fixed;background-color:rgba(0,0,0,.3);left:0;right:0;top:0;bottom:0;width:100%;height:100%;z-index:1}.alertify.hide{opacity:0;pointer-events:none}.alertify,.alertify.show{box-sizing:border-box;transition:all .33s cubic-bezier(.25,.8,.25,1)}.alertify,.alertify *{box-sizing:border-box}.alertify .dialog{padding:12px}.alertify .alert,.alertify .dialog{width:100%;margin:0 auto;position:relative;top:50%;transform:translateY(-50%)}.alertify .alert>*,.alertify .dialog>*{width:400px;max-width:95%;margin:0 auto;text-align:center;padding:12px;background:#fff;box-shadow:0 2px 4px -1px rgba(0,0,0,.14),0 4px 5px 0 rgba(0,0,0,.098),0 1px 10px 0 rgba(0,0,0,.084)}.alertify .alert .msg,.alertify .dialog .msg{padding:12px;margin-bottom:12px;margin:0;text-align:left}.alertify .alert input:not(.form-control),.alertify .dialog input:not(.form-control){margin-bottom:15px;width:100%;font-size:100%;padding:12px}.alertify .alert input:not(.form-control):focus,.alertify .dialog input:not(.form-control):focus{outline-offset:-2px}.alertify .alert nav,.alertify .dialog nav{text-align:right}.alertify .alert nav button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button),.alertify .dialog nav button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button){background:transparent;box-sizing:border-box;color:rgba(0,0,0,.87);position:relative;outline:0;border:0;display:inline-block;-ms-flex-align:center;-ms-grid-row-align:center;align-items:center;padding:0 6px;margin:6px 8px;line-height:36px;min-height:36px;white-space:nowrap;min-width:88px;text-align:center;text-transform:uppercase;font-size:14px;text-decoration:none;cursor:pointer;border:1px solid transparent;border-radius:2px}.alertify .alert nav 
button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button):active,.alertify .alert nav button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button):hover,.alertify .dialog nav button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button):active,.alertify .dialog nav button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button):hover{background-color:rgba(0,0,0,.05)}.alertify .alert nav button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button):focus,.alertify .dialog nav button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button):focus{border:1px solid rgba(0,0,0,.1)}.alertify .alert nav button.btn,.alertify .dialog nav button.btn{margin:6px 4px}.alertify-logs{position:fixed;z-index:1}.alertify-logs.bottom,.alertify-logs:not(.top){bottom:16px}.alertify-logs.left,.alertify-logs:not(.right){left:16px}.alertify-logs.left>*,.alertify-logs:not(.right)>*{float:left;transform:translateZ(0);height:auto}.alertify-logs.left>.show,.alertify-logs:not(.right)>.show{left:0}.alertify-logs.left>*,.alertify-logs.left>.hide,.alertify-logs:not(.right)>*,.alertify-logs:not(.right)>.hide{left:-110%}.alertify-logs.right{right:16px}.alertify-logs.right>*{float:right;transform:translateZ(0)}.alertify-logs.right>.show{right:0;opacity:1}.alertify-logs.right>*,.alertify-logs.right>.hide{right:-110%;opacity:0}.alertify-logs.top{top:0}.alertify-logs>*{box-sizing:border-box;transition:all .4s cubic-bezier(.25,.8,.25,1);position:relative;clear:both;backface-visibility:hidden;perspective:1000;max-height:0;margin:0;padding:0;overflow:hidden;opacity:0;pointer-events:none}.alertify-logs>.show{margin-top:12px;opacity:1;max-height:1000px;padding:12px;pointer-events:auto}
|
||||
File diff suppressed because one or more lines are too long
7523
app/assets/bootstrap.css
vendored
7523
app/assets/bootstrap.css
vendored
File diff suppressed because it is too large
Load diff
|
|
@ -1,84 +0,0 @@
|
|||
<!DOCTYPE html>
<!--
  Zimit landing page: a single form (id "zimcreator") whose values are sent
  as JSON by the inline submitForm() script further down the page.
-->
<head>
  <!-- Metadata now lives inside <head>; the original closed the element
       before declaring any of it. The charset is declared once, first, so
       it is seen before any text is parsed. -->
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Zimit — Create a zim archive out of a website URL</title>
  <link rel="stylesheet" href="./assets/bootstrap.css">
  <link rel="stylesheet" href="./assets/alertify.css">
</head>
<body>
  <div class="navbar navbar-default navbar-static-top">
    <div class="container">
      <div class="navbar-header">
        <a class="navbar-brand" href="#">Zim it!</a>
      </div>
      <div class="navbar-collapse collapse">
        <ul class="nav navbar-nav navbar-right">
          <li><a href="http://www.openzim.org/wiki/Mission">Our values</a></li>
        </ul>
      </div>
    </div>
  </div>
  <div class="container">
    <!-- "return" propagates submitForm()'s false to the browser, which
         cancels the native submission; without it the page reloads while
         the fetch() request is still in flight. -->
    <form action="#" id="zimcreator" onsubmit="return submitForm()">
      <div class="form-group field field-object">
        <fieldset>
          <div class="form-group field field-string">
            <label class="control-label" for="url">Website URL</label>
            <input id="url" label="Website URL" placeholder="https://google.com" class="form-control" type="url">
          </div>
          <div class="form-group field field-string">
            <!-- Each label targets its own input (both used to point at "url"). -->
            <label class="control-label" for="title">Zim Title</label>
            <input id="title" label="Zim Title" placeholder="A great website" class="form-control" type="text">
          </div>
          <div class="form-group field field-string">
            <label class="control-label" for="email">Enter an email to be notified when this is finished</label>
            <input id="email" label="Email" placeholder="john@doe.com" class="form-control" type="email">
          </div>
        </fieldset>
      </div>
      <p>
        <button type="submit" class="btn btn-info">Create the Zim file!</button>
      </p>
    </form>
    <p>
      This is a <a href="http://www.openzim.org/wiki/OpenZIM">Zim</a> creator. Enter the <em>url</em> of the website you want to turn into a zim file, a <em>title</em> and click on <em>Create the Zim file!</em>
    </p>
    <p>Enjoy!</p>
  </div>
|
||||
<script src="./assets/alertify.js"></script>
|
||||
<script type="text/javascript">
|
||||
|
||||
/**
 * Read the current value of one control of the "zimcreator" form.
 *
 * @param {string} field - Key used to look the control up in the form's
 *     `elements` collection.
 * @returns {*} The `value` property of the matching control.
 */
function getField(field) {
  var form = document.forms['zimcreator'];
  var control = form.elements[field];
  return control.value;
}
|
||||
|
||||
/**
 * Send the form values to the server as JSON and report the outcome
 * through alertify notifications.
 *
 * The url, title and email fields are POSTed to "/website-zim". A response
 * status of 400 or above shows an error, any other response shows a success
 * message, and a rejected request shows a connectivity error.
 *
 * @returns {boolean} Always false. An inline handler has to propagate it
 *     (`return submitForm()`) for the value to cancel the native submission.
 */
function submitForm() {
  // Key order is kept as url, title, email so the serialized body is stable.
  var payload = {};
  payload.url = getField('url');
  payload.title = getField('title');
  payload.email = getField('email');

  var request = {
    method: "POST",
    body: JSON.stringify(payload),
    headers: {'Content-Type': 'application/json'}
  };

  // The server answered: only statuses of 400 and above count as a failure.
  function onResponse(response) {
    if (response.status >= 400) {
      alertify.error("The server wasn't able to start the job, please check your inputs.");
      return;
    }
    alertify.success("The job has been submitted! You'll receive an email when it's finished.");
  }

  // The promise chain rejected (including the request itself failing).
  function onFailure() {
    alertify.error("Sorry, we weren't able to join the server. This is usually due to connectivity issues.");
  }

  fetch("/website-zim", request).then(onResponse).catch(onFailure);
  return false;
}
|
||||
</script>
|
||||
|
||||
</body>
|
||||
BIN
favicon.ico
BIN
favicon.ico
Binary file not shown.
|
Before Width: | Height: | Size: 9.1 KiB |
981
offliner-definition.json
Normal file
981
offliner-definition.json
Normal file
|
|
@ -0,0 +1,981 @@
|
|||
{
|
||||
"offliner_id": "zimit",
|
||||
"stdOutput": true,
|
||||
"stdStats": "zimit-progress-file",
|
||||
"flags": {
|
||||
"seeds": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Seeds",
|
||||
"description": "The seed URL(s) to start crawling from. Multiple seed URLs must be separated by a comma (usually not needed, these are just the crawl seeds). The first seed URL is used as the ZIM homepage"
|
||||
},
|
||||
"seed_file": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Seed File",
|
||||
"description": "If set, read a list of seed urls, one per line. HTTPS URL to an online file."
|
||||
},
|
||||
"lang": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Browser Language",
|
||||
"description": "If set, sets the language used by the browser, should be ISO 639 language[-country] code, e.g. `en` or `en-GB`"
|
||||
},
|
||||
"title": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Title",
|
||||
"description": "Custom title for your ZIM. Defaults to title of main page",
|
||||
"minLength": 1,
|
||||
"maxLength": 30
|
||||
},
|
||||
"description": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Description",
|
||||
"description": "Description for ZIM",
|
||||
"minLength": 1,
|
||||
"maxLength": 80
|
||||
},
|
||||
"favicon": {
|
||||
"type": "blob",
|
||||
"kind": "image",
|
||||
"required": false,
|
||||
"title": "Illustration",
|
||||
"description": "URL for Illustration. "
|
||||
},
|
||||
"tags": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "ZIM Tags",
|
||||
"description": "Single string with individual tags separated by a semicolon."
|
||||
},
|
||||
"creator": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Creator",
|
||||
"description": "Name of content creator"
|
||||
},
|
||||
"publisher": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Publisher",
|
||||
"isPublisher": true,
|
||||
"description": "Custom publisher name (ZIM metadata). openZIM otherwise"
|
||||
},
|
||||
"source": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Source",
|
||||
"description": "Source name/URL of content"
|
||||
},
|
||||
"workers": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Workers",
|
||||
"description": "The number of workers to run in parallel. Defaults to 1",
|
||||
"min": 1
|
||||
},
|
||||
"wait_until": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "WaitUntil",
|
||||
"description": "Puppeteer page.goto() condition to wait for before continuing. One of load, domcontentloaded, networkidle0 or networkidle2, or a comma-separated combination of those. Default is load,networkidle2"
|
||||
},
|
||||
"extra_hops": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Extra Hops",
|
||||
"description": "Number of extra 'hops' to follow, beyond the current scope. Default is 0",
|
||||
"min": 0
|
||||
},
|
||||
"page_limit": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Page Limit",
|
||||
"description": "Limit crawl to this number of pages. Default is 0 (no-limit).",
|
||||
"min": 0
|
||||
},
|
||||
"max_page_limit": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Max Page Limit",
|
||||
"description": "Maximum pages to crawl, overriding pageLimit if both are set. Default is 0 (no-limit)",
|
||||
"min": 0
|
||||
},
|
||||
"page_load_timeout": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Page Load Timeout",
|
||||
"description": "Timeout for each page to load (in seconds). Default is 90",
|
||||
"min": 0
|
||||
},
|
||||
"scope_type": {
|
||||
"type": "string-enum",
|
||||
"required": false,
|
||||
"title": "Scope Type",
|
||||
"description": "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom if scopeIncludeRx is set, prefix otherwise.",
|
||||
"choices": [
|
||||
{
|
||||
"title": "Page",
|
||||
"value": "page"
|
||||
},
|
||||
{
|
||||
"title": "Page SPA",
|
||||
"value": "page-spa"
|
||||
},
|
||||
{
|
||||
"title": "Prefix",
|
||||
"value": "prefix"
|
||||
},
|
||||
{
|
||||
"title": "Host",
|
||||
"value": "host"
|
||||
},
|
||||
{
|
||||
"title": "Domain",
|
||||
"value": "domain"
|
||||
},
|
||||
{
|
||||
"title": "Any",
|
||||
"value": "any"
|
||||
},
|
||||
{
|
||||
"title": "Custom",
|
||||
"value": "custom"
|
||||
}
|
||||
]
|
||||
},
|
||||
"scope_include_rx": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Scope Include Regex",
|
||||
"description": "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of seed)"
|
||||
},
|
||||
"scope_exclude_rx": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Scope Exclude Regex",
|
||||
"description": "Regex of page URLs that should be excluded from the crawl"
|
||||
},
|
||||
"allow_hash_urls": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Allow Hashtag URLs",
|
||||
"description": "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content"
|
||||
},
|
||||
"mobile_device": {
|
||||
"type": "string-enum",
|
||||
"required": false,
|
||||
"title": "As device",
|
||||
"description": "Device to crawl as. See Puppeteer's Device.ts for a list",
|
||||
"choices": [
|
||||
{
|
||||
"title": "Blackberry Playbook",
|
||||
"value": "Blackberry PlayBook"
|
||||
},
|
||||
{
|
||||
"title": "Blackberry Playbook Landscape",
|
||||
"value": "Blackberry PlayBook landscape"
|
||||
},
|
||||
{
|
||||
"title": "Blackberry Z30",
|
||||
"value": "BlackBerry Z30"
|
||||
},
|
||||
{
|
||||
"title": "Blackberry Z30 Landscape",
|
||||
"value": "BlackBerry Z30 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy Note 3",
|
||||
"value": "Galaxy Note 3"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy Note 3 Landscape",
|
||||
"value": "Galaxy Note 3 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy Note II",
|
||||
"value": "Galaxy Note II"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy Note II Landscape",
|
||||
"value": "Galaxy Note II landscape"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S III",
|
||||
"value": "Galaxy S III"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S III Landscape",
|
||||
"value": "Galaxy S III landscape"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S5",
|
||||
"value": "Galaxy S5"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S5 Landscape",
|
||||
"value": "Galaxy S5 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S8",
|
||||
"value": "Galaxy S8"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S8 Landscape",
|
||||
"value": "Galaxy S8 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S9 Plus",
|
||||
"value": "Galaxy S9+"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S9 Plus Landscape",
|
||||
"value": "Galaxy S9+ landscape"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy Tab S4",
|
||||
"value": "Galaxy Tab S4"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy Tab S4 Landscape",
|
||||
"value": "Galaxy Tab S4 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPad",
|
||||
"value": "iPad"
|
||||
},
|
||||
{
|
||||
"title": "iPad Landscape",
|
||||
"value": "iPad landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPad Gen 6",
|
||||
"value": "iPad (gen 6)"
|
||||
},
|
||||
{
|
||||
"title": "iPad Gen 6 Landscape",
|
||||
"value": "iPad (gen 6) landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPad Gen 7",
|
||||
"value": "iPad (gen 7)"
|
||||
},
|
||||
{
|
||||
"title": "iPad Gen 7 Landscape",
|
||||
"value": "iPad (gen 7) landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPad Mini",
|
||||
"value": "iPad Mini"
|
||||
},
|
||||
{
|
||||
"title": "iPad Mini Landscape",
|
||||
"value": "iPad Mini landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPad Pro",
|
||||
"value": "iPad Pro"
|
||||
},
|
||||
{
|
||||
"title": "iPad Pro Landscape",
|
||||
"value": "iPad Pro landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPad Pro 11",
|
||||
"value": "iPad Pro 11"
|
||||
},
|
||||
{
|
||||
"title": "iPad Pro 11 Landscape",
|
||||
"value": "iPad Pro 11 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 4",
|
||||
"value": "iPhone 4"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 4 Landscape",
|
||||
"value": "iPhone 4 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 5",
|
||||
"value": "iPhone 5"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 5 Landscape",
|
||||
"value": "iPhone 5 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 6",
|
||||
"value": "iPhone 6"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 6 Landscape",
|
||||
"value": "iPhone 6 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 6 Plus",
|
||||
"value": "iPhone 6 Plus"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 6 Plus Landscape",
|
||||
"value": "iPhone 6 Plus landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 7",
|
||||
"value": "iPhone 7"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 7 Landscape",
|
||||
"value": "iPhone 7 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 7 Plus",
|
||||
"value": "iPhone 7 Plus"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 7 Plus Landscape",
|
||||
"value": "iPhone 7 Plus landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 8",
|
||||
"value": "iPhone 8"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 8 Landscape",
|
||||
"value": "iPhone 8 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 8 Plus",
|
||||
"value": "iPhone 8 Plus"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 8 Plus Landscape",
|
||||
"value": "iPhone 8 Plus landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone SE",
|
||||
"value": "iPhone SE"
|
||||
},
|
||||
{
|
||||
"title": "iPhone SE Landscape",
|
||||
"value": "iPhone SE landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone X",
|
||||
"value": "iPhone X"
|
||||
},
|
||||
{
|
||||
"title": "iPhone X Landscape",
|
||||
"value": "iPhone X landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone XR",
|
||||
"value": "iPhone XR"
|
||||
},
|
||||
{
|
||||
"title": "iPhone XR Landscape",
|
||||
"value": "iPhone XR landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 11",
|
||||
"value": "iPhone 11"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 11 Landscape",
|
||||
"value": "iPhone 11 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 11 Pro",
|
||||
"value": "iPhone 11 Pro"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 11 Pro Landscape",
|
||||
"value": "iPhone 11 Pro landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 11 Pro Max",
|
||||
"value": "iPhone 11 Pro Max"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 11 Pro Max Landscape",
|
||||
"value": "iPhone 11 Pro Max landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12",
|
||||
"value": "iPhone 12"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12 Landscape",
|
||||
"value": "iPhone 12 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12 Pro",
|
||||
"value": "iPhone 12 Pro"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12 Pro Landscape",
|
||||
"value": "iPhone 12 Pro landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12 Pro Max",
|
||||
"value": "iPhone 12 Pro Max"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12 Pro Max Landscape",
|
||||
"value": "iPhone 12 Pro Max landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12 Mini",
|
||||
"value": "iPhone 12 Mini"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12 Mini Landscape",
|
||||
"value": "iPhone 12 Mini landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13",
|
||||
"value": "iPhone 13"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13 Landscape",
|
||||
"value": "iPhone 13 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13 Pro",
|
||||
"value": "iPhone 13 Pro"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13 Pro Landscape",
|
||||
"value": "iPhone 13 Pro landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13 Pro Max",
|
||||
"value": "iPhone 13 Pro Max"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13 Pro Max Landscape",
|
||||
"value": "iPhone 13 Pro Max landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13 Mini",
|
||||
"value": "iPhone 13 Mini"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13 Mini Landscape",
|
||||
"value": "iPhone 13 Mini landscape"
|
||||
},
|
||||
{
|
||||
"title": "Jio Phone 2",
|
||||
"value": "JioPhone 2"
|
||||
},
|
||||
{
|
||||
"title": "Jio Phone 2 Landscape",
|
||||
"value": "JioPhone 2 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Kindle Fire HDX",
|
||||
"value": "Kindle Fire HDX"
|
||||
},
|
||||
{
|
||||
"title": "Kindle Fire HDX Landscape",
|
||||
"value": "Kindle Fire HDX landscape"
|
||||
},
|
||||
{
|
||||
"title": "LG Optimus L70",
|
||||
"value": "LG Optimus L70"
|
||||
},
|
||||
{
|
||||
"title": "LG Optimus L70 Landscape",
|
||||
"value": "LG Optimus L70 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Microsoft Lumia 550",
|
||||
"value": "Microsoft Lumia 550"
|
||||
},
|
||||
{
|
||||
"title": "Microsoft Lumia 950",
|
||||
"value": "Microsoft Lumia 950"
|
||||
},
|
||||
{
|
||||
"title": "Microsoft Lumia 950 Landscape",
|
||||
"value": "Microsoft Lumia 950 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 10",
|
||||
"value": "Nexus 10"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 10 Landscape",
|
||||
"value": "Nexus 10 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 4",
|
||||
"value": "Nexus 4"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 4 Landscape",
|
||||
"value": "Nexus 4 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 5",
|
||||
"value": "Nexus 5"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 5 Landscape",
|
||||
"value": "Nexus 5 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 5X",
|
||||
"value": "Nexus 5X"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 5X Landscape",
|
||||
"value": "Nexus 5X landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 6",
|
||||
"value": "Nexus 6"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 6 Landscape",
|
||||
"value": "Nexus 6 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 6P",
|
||||
"value": "Nexus 6P"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 6P Landscape",
|
||||
"value": "Nexus 6P landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 7",
|
||||
"value": "Nexus 7"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 7 Landscape",
|
||||
"value": "Nexus 7 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nokia Lumia 520",
|
||||
"value": "Nokia Lumia 520"
|
||||
},
|
||||
{
|
||||
"title": "Nokia Lumia 520 Landscape",
|
||||
"value": "Nokia Lumia 520 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nokia N9",
|
||||
"value": "Nokia N9"
|
||||
},
|
||||
{
|
||||
"title": "Nokia N9 Landscape",
|
||||
"value": "Nokia N9 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 2",
|
||||
"value": "Pixel 2"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 2 Landscape",
|
||||
"value": "Pixel 2 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 2 XL",
|
||||
"value": "Pixel 2 XL"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 2 XL Landscape",
|
||||
"value": "Pixel 2 XL landscape"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 3",
|
||||
"value": "Pixel 3"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 3 Landscape",
|
||||
"value": "Pixel 3 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 4",
|
||||
"value": "Pixel 4"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 4 Landscape",
|
||||
"value": "Pixel 4 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 4A 5G",
|
||||
"value": "Pixel 4a (5G)"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 4A 5G Landscape",
|
||||
"value": "Pixel 4a (5G) landscape"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 5",
|
||||
"value": "Pixel 5"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 5 Landscape",
|
||||
"value": "Pixel 5 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Moto G4",
|
||||
"value": "Moto G4"
|
||||
},
|
||||
{
|
||||
"title": "Moto G4 Landscape",
|
||||
"value": "Moto G4 landscape"
|
||||
}
|
||||
]
|
||||
},
|
||||
"select_links": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Select Links",
|
||||
"description": "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]"
|
||||
},
|
||||
"click_selector": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Click Selector",
|
||||
"description": "Selector for elements to click when using the autoclick behavior. Default is 'a'"
|
||||
},
|
||||
"block_rules": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Block Rules",
|
||||
"description": "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe"
|
||||
},
|
||||
"block_message": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Block Message",
|
||||
"description": "If specified, when a URL is blocked, a record with this error message is added instead"
|
||||
},
|
||||
"block_ads": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Block Ads",
|
||||
"description": "If set, block advertisements from being loaded (based on Steven Black's blocklist). Note that some bad domains are also blocked by zimit configuration even if this option is not set."
|
||||
},
|
||||
"ad_block_message": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Ads Block Message",
|
||||
"description": "If specified, when an ad is blocked, a record with this error message is added instead"
|
||||
},
|
||||
"user_agent": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "User Agent",
|
||||
"description": "Override user-agent with specified"
|
||||
},
|
||||
"user_agent_suffix": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "User Agent Suffix",
|
||||
"description": "Append suffix to existing browser user-agent. Defaults to +Zimit"
|
||||
},
|
||||
"use_sitemap": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Sitemap URL",
|
||||
"description": "Use as sitemap to get additional URLs for the crawl (usually at /sitemap.xml)"
|
||||
},
|
||||
"sitemap_from_date": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Sitemap From Date",
|
||||
"description": "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)"
|
||||
},
|
||||
"sitemap_to_date": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Sitemap To Date",
|
||||
"description": "If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)"
|
||||
},
|
||||
"behavior_timeout": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Behavior Timeout",
|
||||
"description": "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish. Default is 90.",
|
||||
"min": 0
|
||||
},
|
||||
"post_load_delay": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Post Load Delay",
|
||||
"description": "If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors. Default is 0.",
|
||||
"min": 0
|
||||
},
|
||||
"page_extra_delay": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Page Extra Delay",
|
||||
"description": "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page. Default is 0.",
|
||||
"min": 0
|
||||
},
|
||||
"dedup_policy": {
|
||||
"type": "string-enum",
|
||||
"required": false,
|
||||
"title": "Dedup Policy",
|
||||
"description": "Deduplication policy. One of skip, revisit or keep. Default is skip",
|
||||
"choices": [
|
||||
{
|
||||
"title": "Skip",
|
||||
"value": "skip"
|
||||
},
|
||||
{
|
||||
"title": "Revisit",
|
||||
"value": "revisit"
|
||||
},
|
||||
{
|
||||
"title": "Keep",
|
||||
"value": "keep"
|
||||
}
|
||||
]
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Screenshot",
|
||||
"description": "Screenshot options for crawler. One of view, thumbnail, fullPage, fullPageFinal or a comma-separated combination of those."
|
||||
},
|
||||
"size_soft_limit": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Size Soft Limit",
|
||||
"description": "If set, save crawl state and stop crawl if WARC size exceeds this value. ZIM will still be created.",
|
||||
"min": 0
|
||||
},
|
||||
"size_hard_limit": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Size Hard Limit",
|
||||
"description": "If set, exit crawler and fail the scraper immediately if WARC size exceeds this value",
|
||||
"min": 0
|
||||
},
|
||||
"disk_utilization": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Disk Utilization",
|
||||
"description": "Save state and exit if disk utilization exceeds this percentage value. Default (if not set) is 90%. Set to 0 to disable disk utilization check.",
|
||||
"min": 0
|
||||
},
|
||||
"time_soft_limit": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Time Soft Limit",
|
||||
"description": "If set, save crawl state and stop crawl if WARC(s) creation takes longer than this value, in seconds. ZIM will still be created.",
|
||||
"min": 0
|
||||
},
|
||||
"time_hard_limit": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Time Hard Limit",
|
||||
"description": "If set, exit crawler and fail the scraper immediately if WARC(s) creation takes longer than this value, in seconds",
|
||||
"min": 0
|
||||
},
|
||||
"net_idle_wait": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Net Idle Wait",
|
||||
"description": "If set, wait for network idle after page load and after behaviors are done (in seconds). If -1 (default), determine based on scope."
|
||||
},
|
||||
"origin_override": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Origin Override",
|
||||
"description": "If set, will redirect requests from each origin in key to origin in the value, eg. https://host:port=http://alt-host:alt-port."
|
||||
},
|
||||
"max_page_retries": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Max Page Retries",
|
||||
"description": "If set, number of times to retry a page that failed to load before page is considered to have failed. Default is 2.",
|
||||
"min": 0
|
||||
},
|
||||
"fail_on_failed_seed": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Fail on failed seed",
|
||||
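"description": "If set, the crawler fails with exit code 1 if any seed fails to load. When combined with --failOnInvalidStatus, a seed returning a 4xx or 5xx response also fails the crawl"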
|
||||
},
|
||||
"fail_on_invalid_status": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Fail on invalid status",
|
||||
"description": "If set, will treat pages with 4xx or 5xx response as failures. When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses"
|
||||
},
|
||||
"fail_on_failed_limit": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Fail on failed - Limit",
|
||||
"description": "If set, save state and exit if number of failed pages exceeds this value.",
|
||||
"min": 0
|
||||
},
|
||||
"warcs": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "WARC files",
|
||||
"description": "Comma-separated list of WARC files to use as input."
|
||||
},
|
||||
"verbose": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Verbose mode",
|
||||
"description": "Whether to display additional logs"
|
||||
},
|
||||
"keep": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Keep",
|
||||
"description": "Should be True. Developer option: must be True if we want to keep the WARC files for artifacts archiving.",
|
||||
"default": true
|
||||
},
|
||||
"output": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Output folder",
|
||||
"description": "Output folder for ZIM file(s). Leave it as `/output`",
|
||||
"pattern": "^/output$"
|
||||
},
|
||||
"admin_email": {
|
||||
"type": "email",
|
||||
"required": false,
|
||||
"title": "Admin Email",
|
||||
"description": "Admin Email for crawler: used in UserAgent so website admin can contact us",
|
||||
"default": "contact+zimfarm@kiwix.org"
|
||||
},
|
||||
"profile": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Browser profile",
|
||||
"description": "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory for Browsertrix crawler."
|
||||
},
|
||||
"behaviors": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Behaviors",
|
||||
"description": "Which background behaviors to enable on each page. Defaults to autoplay,autofetch,siteSpecific."
|
||||
},
|
||||
"depth": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Depth",
|
||||
"description": "The depth of the crawl for all seeds. Default is -1 (infinite).",
|
||||
"min": -1
|
||||
},
|
||||
"zim_lang": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "ZIM Language",
|
||||
"description": "Language metadata of ZIM (warc2zim --lang param). ISO-639-3 code. Retrieved from homepage if found, fallback to `eng`",
|
||||
"alias": "zim-lang",
|
||||
"customValidator": "language_code"
|
||||
},
|
||||
"long_description": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Long description",
|
||||
"description": "Optional long description for your ZIM",
|
||||
"minLength": 1,
|
||||
"maxLength": 4000,
|
||||
"alias": "long-description"
|
||||
},
|
||||
"custom_css": {
|
||||
"type": "blob",
|
||||
"kind": "css",
|
||||
"required": false,
|
||||
"title": "Custom CSS",
|
||||
"description": "URL to a CSS file to inject into pages",
|
||||
"alias": "custom-css"
|
||||
},
|
||||
"charsets_to_try": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Charsets to try",
|
||||
"description": "List of charsets to try decode content when charset is not found",
|
||||
"alias": "charsets-to-try"
|
||||
},
|
||||
"ignore_content_header_charsets": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Ignore Content Header Charsets",
|
||||
"description": "Ignore the charsets specified in content headers - first bytes - typically because they are wrong.",
|
||||
"alias": "ignore-content-header-charsets"
|
||||
},
|
||||
"content_header_bytes_length": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Content Header Bytes Length",
|
||||
"description": "How many bytes to consider when searching for content charsets in header (default is 1024).",
|
||||
"alias": "content-header-bytes-length",
|
||||
"min": 0
|
||||
},
|
||||
"ignore_http_header_charsets": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Ignore HTTP Header Charsets",
|
||||
"description": "Ignore the charsets specified in HTTP `Content-Type` headers, typically because they are wrong.",
|
||||
"alias": "ignore-http-header-charsets"
|
||||
},
|
||||
"encoding_aliases": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Encoding Aliases",
|
||||
"description": "List of encoding/charset aliases to decode WARC content. Aliases are used when the encoding specified in upstream server exists in Python under a different name. This parameter is single string, multiple values are separated by a comma, like in alias1=encoding1,alias2=encoding2.",
|
||||
"alias": "encoding-aliases"
|
||||
},
|
||||
"custom_behaviors": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Custom Behaviors",
|
||||
"description": "JS code for custom behaviors to customize crawler. Single string with individual JS files URL/path separated by a comma.",
|
||||
"alias": "custom-behaviours"
|
||||
},
|
||||
"zimit_progress_file": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Zimit Progress File",
|
||||
"description": "Scraping progress file. Leave it as `/output/task_progress.json`",
|
||||
"alias": "zimit-progress-file",
|
||||
"pattern": "^/output/task_progress\\.json$"
|
||||
},
|
||||
"replay_viewer_source": {
|
||||
"type": "url",
|
||||
"required": false,
|
||||
"title": "Replay Viewer Source",
|
||||
"description": "URL from which to load the ReplayWeb.page replay viewer from",
|
||||
"alias": "replay-viewer-source"
|
||||
},
|
||||
"zim_file": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "ZIM filename",
|
||||
"description": "ZIM file name (based on --name if not provided). Include {period} to insert date period dynamically",
|
||||
"alias": "zim-file",
|
||||
"pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+_)([a-z0-9\\-\\.]+_|)([\\d]{4}-[\\d]{2}|\\{period\\}).zim$",
|
||||
"relaxedPattern": "^[A-Za-z0-9._-]+$"
|
||||
},
|
||||
"name": {
|
||||
"type": "string",
|
||||
"required": true,
|
||||
"title": "ZIM name",
|
||||
"description": "Name of the ZIM.",
|
||||
"alias": "name",
|
||||
"pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$",
|
||||
"relaxedPattern": "^[A-Za-z0-9._-]+$"
|
||||
},
|
||||
"overwrite": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Overwrite",
|
||||
"description": "Whether to overwrite existing ZIM file if it exists"
|
||||
}
|
||||
}
|
||||
}
|
||||
225
pyproject.toml
Normal file
225
pyproject.toml
Normal file
|
|
@ -0,0 +1,225 @@
|
|||
[build-system]
|
||||
requires = ["hatchling", "hatch-openzim"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "zimit"
|
||||
requires-python = ">=3.13,<3.14"
|
||||
description = "Make ZIM file from any website through crawling"
|
||||
readme = "README.md"
|
||||
dependencies = [
|
||||
"requests==2.32.3",
|
||||
"inotify==0.2.10",
|
||||
"tld==0.13",
|
||||
"warc2zim @ git+https://github.com/openzim/warc2zim@main",
|
||||
]
|
||||
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
|
||||
|
||||
[tool.hatch.metadata.hooks.openzim-metadata]
|
||||
kind = "scraper"
|
||||
|
||||
[tool.hatch.metadata]
|
||||
allow-direct-references = true # to be removed once we use a released warc2zim version
|
||||
|
||||
[project.optional-dependencies]
|
||||
scripts = [
|
||||
"invoke==2.2.0",
|
||||
]
|
||||
lint = [
|
||||
"black==25.1.0",
|
||||
"ruff==0.9.4",
|
||||
]
|
||||
check = [
|
||||
"pyright==1.1.393",
|
||||
]
|
||||
test = [
|
||||
"pytest==8.3.4",
|
||||
"coverage==7.6.10",
|
||||
]
|
||||
dev = [
|
||||
"pre-commit==4.1.0",
|
||||
"debugpy==1.8.12",
|
||||
"selenium==4.28.1", # used in daily tests, convenient for dev purpose (autocompletion)
|
||||
"zimit[scripts]",
|
||||
"zimit[lint]",
|
||||
"zimit[test]",
|
||||
"zimit[check]",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
zimit = "zimit:zimit.zimit"
|
||||
|
||||
[tool.hatch.version]
|
||||
path = "src/zimit/__about__.py"
|
||||
|
||||
[tool.hatch.build]
|
||||
exclude = [
|
||||
"/.github",
|
||||
]
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/zimit"]
|
||||
|
||||
[tool.hatch.envs.default]
|
||||
features = ["dev"]
|
||||
|
||||
[tool.hatch.envs.test]
|
||||
features = ["scripts", "test"]
|
||||
|
||||
[tool.hatch.envs.test.scripts]
|
||||
run = "inv test --args '{args}'"
|
||||
run-cov = "inv test-cov --args '{args}'"
|
||||
report-cov = "inv report-cov"
|
||||
coverage = "inv coverage --args '{args}'"
|
||||
html = "inv coverage --html --args '{args}'"
|
||||
|
||||
[tool.hatch.envs.lint]
|
||||
template = "lint"
|
||||
skip-install = false
|
||||
features = ["scripts", "lint"]
|
||||
|
||||
[tool.hatch.envs.lint.scripts]
|
||||
black = "inv lint-black --args '{args}'"
|
||||
ruff = "inv lint-ruff --args '{args}'"
|
||||
all = "inv lintall --args '{args}'"
|
||||
fix-black = "inv fix-black --args '{args}'"
|
||||
fix-ruff = "inv fix-ruff --args '{args}'"
|
||||
fixall = "inv fixall --args '{args}'"
|
||||
|
||||
[tool.hatch.envs.check]
|
||||
features = ["scripts", "check"]
|
||||
|
||||
[tool.hatch.envs.check.scripts]
|
||||
pyright = "inv check-pyright --args '{args}'"
|
||||
all = "inv checkall --args '{args}'"
|
||||
|
||||
[tool.black]
|
||||
line-length = 88
|
||||
target-version = ['py313']
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py313"
|
||||
line-length = 88
|
||||
src = ["src"]
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
"A", # flake8-builtins
|
||||
# "ANN", # flake8-annotations
|
||||
"ARG", # flake8-unused-arguments
|
||||
# "ASYNC", # flake8-async
|
||||
"B", # flake8-bugbear
|
||||
# "BLE", # flake8-blind-except
|
||||
"C4", # flake8-comprehensions
|
||||
"C90", # mccabe
|
||||
# "COM", # flake8-commas
|
||||
# "D", # pydocstyle
|
||||
# "DJ", # flake8-django
|
||||
"DTZ", # flake8-datetimez
|
||||
"E", # pycodestyle (default)
|
||||
"EM", # flake8-errmsg
|
||||
# "ERA", # eradicate
|
||||
# "EXE", # flake8-executable
|
||||
"F", # Pyflakes (default)
|
||||
# "FA", # flake8-future-annotations
|
||||
"FBT", # flake8-boolean-trap
|
||||
# "FLY", # flynt
|
||||
# "G", # flake8-logging-format
|
||||
"I", # isort
|
||||
"ICN", # flake8-import-conventions
|
||||
# "INP", # flake8-no-pep420
|
||||
# "INT", # flake8-gettext
|
||||
"ISC", # flake8-implicit-str-concat
|
||||
"N", # pep8-naming
|
||||
# "NPY", # NumPy-specific rules
|
||||
# "PD", # pandas-vet
|
||||
# "PGH", # pygrep-hooks
|
||||
# "PIE", # flake8-pie
|
||||
# "PL", # Pylint
|
||||
"PLC", # Pylint: Convention
|
||||
"PLE", # Pylint: Error
|
||||
"PLR", # Pylint: Refactor
|
||||
"PLW", # Pylint: Warning
|
||||
# "PT", # flake8-pytest-style
|
||||
# "PTH", # flake8-use-pathlib
|
||||
# "PYI", # flake8-pyi
|
||||
"Q", # flake8-quotes
|
||||
# "RET", # flake8-return
|
||||
# "RSE", # flake8-raise
|
||||
"RUF", # Ruff-specific rules
|
||||
"S", # flake8-bandit
|
||||
# "SIM", # flake8-simplify
|
||||
# "SLF", # flake8-self
|
||||
"T10", # flake8-debugger
|
||||
"T20", # flake8-print
|
||||
# "TCH", # flake8-type-checking
|
||||
# "TD", # flake8-todos
|
||||
"TID", # flake8-tidy-imports
|
||||
# "TRY", # tryceratops
|
||||
"UP", # pyupgrade
|
||||
"W", # pycodestyle
|
||||
"YTT", # flake8-2020
|
||||
]
|
||||
ignore = [
|
||||
# Allow non-abstract empty methods in abstract base classes
|
||||
"B027",
|
||||
# Remove flake8-errmsg since we consider they bloat the code and provide limited value
|
||||
"EM",
|
||||
# Allow boolean positional values in function calls, like `dict.get(... True)`
|
||||
"FBT003",
|
||||
# Ignore checks for possible passwords
|
||||
"S105", "S106", "S107",
|
||||
# Ignore warnings on subprocess.run / popen
|
||||
"S603",
|
||||
# Ignore complexity
|
||||
"C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
|
||||
]
|
||||
unfixable = [
|
||||
# Don't touch unused imports
|
||||
"F401",
|
||||
]
|
||||
|
||||
[tool.ruff.lint.isort]
|
||||
known-first-party = ["zimit"]
|
||||
|
||||
[tool.ruff.lint.flake8-bugbear]
|
||||
# add exceptions to B008 for fastapi.
|
||||
extend-immutable-calls = ["fastapi.Depends", "fastapi.Query"]
|
||||
|
||||
[tool.ruff.lint.flake8-tidy-imports]
|
||||
ban-relative-imports = "all"
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
# Tests can use magic values, assertions, and relative imports
|
||||
"tests**/**/*" = ["PLR2004", "S101", "TID252"]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
minversion = "7.3"
|
||||
testpaths = ["tests"]
|
||||
pythonpath = [".", "src"]
|
||||
|
||||
[tool.coverage.paths]
|
||||
zimit = ["src/zimit"]
|
||||
tests = ["tests"]
|
||||
|
||||
[tool.coverage.run]
|
||||
source_pkgs = ["zimit"]
|
||||
branch = true
|
||||
parallel = true
|
||||
omit = [
|
||||
"src/zimit/__about__.py",
|
||||
]
|
||||
|
||||
[tool.coverage.report]
|
||||
exclude_lines = [
|
||||
"no cov",
|
||||
"if __name__ == .__main__.:",
|
||||
"if TYPE_CHECKING:",
|
||||
]
|
||||
|
||||
[tool.pyright]
|
||||
include = ["src", "tests", "tasks.py"]
|
||||
exclude = [".env/**", ".venv/**"]
|
||||
extraPaths = ["src"]
|
||||
pythonVersion = "3.13"
|
||||
typeCheckingMode="basic"
|
||||
33
setup.py
33
setup.py
|
|
@ -1,33 +0,0 @@
|
|||
import os
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
here = os.path.abspath(os.path.dirname(__file__))
|
||||
|
||||
with open(os.path.join(here, 'README.rst')) as f:
|
||||
README = f.read()
|
||||
|
||||
|
||||
setup(name='zimit',
|
||||
version=0.1,
|
||||
description='zimit',
|
||||
long_description=README,
|
||||
classifiers=[
|
||||
"Programming Language :: Python",
|
||||
"Framework :: Pylons",
|
||||
"Topic :: Internet :: WWW/HTTP",
|
||||
"Topic :: Internet :: WWW/HTTP :: WSGI :: Application"
|
||||
],
|
||||
keywords="web services",
|
||||
author='',
|
||||
author_email='',
|
||||
url='',
|
||||
packages=find_packages(),
|
||||
include_package_data=True,
|
||||
zip_safe=False,
|
||||
install_requires=['cornice', 'waitress', 'rq', 'colander',
|
||||
'python-slugify', 'pyramid_mailer'],
|
||||
entry_points="""\
|
||||
[paste.app_factory]
|
||||
main=zimit:main
|
||||
""",
|
||||
paster_plugins=['pyramid'])
|
||||
1
src/zimit/__about__.py
Normal file
1
src/zimit/__about__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
__version__ = "3.0.6-dev0"
|
||||
11
src/zimit/constants.py
Normal file
11
src/zimit/constants.py
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
import logging
|
||||
|
||||
from zimscraperlib.logging import getLogger
|
||||
|
||||
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
|
||||
EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT = 14
|
||||
EXIT_CODE_CRAWLER_TIME_LIMIT_HIT = 15
|
||||
NORMAL_WARC2ZIM_EXIT_CODE = 100
|
||||
REQUESTS_TIMEOUT = 10
|
||||
|
||||
logger = getLogger(name="zimit", level=logging.INFO)
|
||||
14
src/zimit/utils.py
Normal file
14
src/zimit/utils.py
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
from zimit.constants import REQUESTS_TIMEOUT
|
||||
|
||||
|
||||
def download_file(url: str, fpath: Path):
    """Stream the content found at ``url`` into the file located at ``fpath``.

    The HTTP status is checked before the destination is opened, so a failed
    request raises without creating or truncating ``fpath``.
    """
    with requests.get(url, stream=True, timeout=REQUESTS_TIMEOUT) as response:
        response.raise_for_status()
        with open(fpath, "wb") as target:
            # write the body chunk by chunk, never holding it all in memory
            target.writelines(response.iter_content(chunk_size=8192))
|
||||
1261
src/zimit/zimit.py
Executable file
1261
src/zimit/zimit.py
Executable file
File diff suppressed because it is too large
Load diff
109
tasks.py
Normal file
109
tasks.py
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
# pyright: strict, reportUntypedFunctionDecorator=false
|
||||
import os
|
||||
|
||||
from invoke.context import Context
|
||||
from invoke.tasks import task # pyright: ignore [reportUnknownVariableType]
|
||||
|
||||
use_pty = not os.getenv("CI", "")
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test(ctx: Context, args: str = ""):
    """run tests (without coverage)

    ``args`` is appended verbatim to the ``pytest`` command line.
    """
    command = f"pytest {args}"
    ctx.run(command, pty=use_pty)
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test_cov(ctx: Context, args: str = ""):
    """run tests with coverage

    ``args`` is appended verbatim to the ``coverage run -m pytest`` command line.
    """
    ctx.run(f"coverage run -m pytest {args}", pty=use_pty)
|
||||
|
||||
|
||||
@task(optional=["html"], help={"html": "flag to export html report"})
def report_cov(ctx: Context, *, html: bool = False):
    """report coverage

    Combines coverage data, prints the report with missing lines and, when
    ``html`` is set, also generates the HTML report.
    """
    # warn=True: a failing `coverage combine` does not abort the task
    ctx.run("coverage combine", warn=True, pty=use_pty)

    reporting_commands = ["coverage report --show-missing"]
    if html:
        reporting_commands.append("coverage html")
    for command in reporting_commands:
        ctx.run(command, pty=use_pty)
|
||||
|
||||
|
||||
@task(
    optional=["args", "html"],
    help={
        "args": "pytest additional arguments",
        "html": "flag to export html report",
    },
)
def coverage(ctx: Context, args: str = "", *, html: bool = False):
    """run tests and report coverage

    Chains the ``test_cov`` task (forwarding ``args``) and the ``report_cov`` task
    (forwarding ``html``).
    """
    # first collect coverage data ...
    test_cov(ctx, args=args)
    # ... then turn it into a report
    report_cov(ctx, html=html)
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "black additional arguments"})
def lint_black(ctx: Context, args: str = "."):
    """check black formatting

    Prints the black version, then runs black in check/diff mode (no file is
    modified) on ``args``, which falls back to ``.`` when empty.
    """
    args = args or "."  # needed for hatch script
    ctx.run("black --version", pty=use_pty)
    ctx.run(f"black --check --diff {args}", pty=use_pty)
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "ruff additional arguments"})
def lint_ruff(ctx: Context, args: str = "."):
    """check ruff rules

    Prints the ruff version, then runs ``ruff check`` on ``args``, which falls
    back to ``.`` when empty.
    """
    args = args or "."  # needed for hatch script
    ctx.run("ruff --version", pty=use_pty)
    ctx.run(f"ruff check {args}", pty=use_pty)
|
||||
|
||||
|
||||
@task(
    optional=["args"],
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
)
def lintall(ctx: Context, args: str = "."):
    """Check linting

    Runs the black check followed by the ruff check with the same ``args``.
    """
    if not args:  # needed for hatch script
        args = "."
    lint_black(ctx, args)
    lint_ruff(ctx, args)
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def check_pyright(ctx: Context, args: str = ""):
    """check static types with pyright

    Prints the pyright version, then runs pyright with ``args`` appended verbatim.
    """
    # use the same pty setting as every other command of this file
    ctx.run("pyright --version", pty=use_pty)
    ctx.run(f"pyright {args}", pty=use_pty)
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def checkall(ctx: Context, args: str = ""):
    """check static types

    Currently only delegates to the pyright check, forwarding ``args`` untouched.
    """
    check_pyright(ctx, args)
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "black additional arguments"})
def fix_black(ctx: Context, args: str = "."):
    """fix black formatting

    Runs black (in rewrite mode) on ``args``, which falls back to ``.`` when empty.
    """
    if not args:  # needed for hatch script
        args = "."
    command = f"black {args}"
    ctx.run(command, pty=use_pty)
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "ruff additional arguments"})
def fix_ruff(ctx: Context, args: str = "."):
    """fix all ruff rules

    Runs ``ruff check --fix`` on ``args``, which falls back to ``.`` when empty.
    """
    if not args:  # needed for hatch script
        args = "."
    command = f"ruff check --fix {args}"
    ctx.run(command, pty=use_pty)
|
||||
|
||||
|
||||
@task(
    optional=["args"],
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
)
def fixall(ctx: Context, args: str = "."):
    """Fix everything automatically

    Applies the black fixes, then the ruff fixes, and finally re-runs the linting
    checks, all with the same ``args``.
    """
    if not args:  # needed for hatch script
        args = "."
    # order matters: fix first, verify last
    for step in (fix_black, fix_ruff, lintall):
        step(ctx, args)
|
||||
75
tests-daily/Dockerfile
Normal file
75
tests-daily/Dockerfile
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
# Let's extract kiwix-tools as usual on alpine temporary build container
|
||||
FROM alpine:3.21 as kiwix-serve
|
||||
LABEL org.opencontainers.image.source https://github.com/openzim/kiwix-tools
|
||||
|
||||
# TARGETPLATFORM is injected by docker build
|
||||
ARG TARGETPLATFORM
|
||||
ARG KIWIX_TOOLS_VERSION
|
||||
|
||||
RUN set -e && \
|
||||
# default (no KIWIX_TOOLS_VERSION set) to today's nightly
|
||||
if [ -z "$KIWIX_TOOLS_VERSION" ] ; then KIWIX_TOOLS_VERSION=$(date +"%Y-%m-%d") ; fi && \
|
||||
apk --no-cache add dumb-init curl && \
|
||||
echo "TARGETPLATFORM: $TARGETPLATFORM" && \
|
||||
if [ "$TARGETPLATFORM" = "linux/386" ]; then ARCH="i586"; \
|
||||
# linux/arm64/v8 points to linux/arm64
|
||||
elif [ "$TARGETPLATFORM" = "linux/arm64/v8" \
|
||||
-o "$TARGETPLATFORM" = "linux/arm64" ]; then ARCH="aarch64"; \
|
||||
# linux/arm translates to linux/arm/v7
|
||||
elif [ "$TARGETPLATFORM" = "linux/arm/v7" ]; then ARCH="armv8"; \
|
||||
elif [ "$TARGETPLATFORM" = "linux/arm/v6" ]; then ARCH="armv6"; \
|
||||
elif [ "$TARGETPLATFORM" = "linux/amd64/v3" \
|
||||
-o "$TARGETPLATFORM" = "linux/amd64/v2" \
|
||||
-o "$TARGETPLATFORM" = "linux/amd64" ]; then ARCH="x86_64"; \
|
||||
# we do not support any other arch so let it fail
|
||||
else ARCH="unknown"; fi && \
|
||||
# download requested kiwix-tools version
|
||||
url="http://mirror.download.kiwix.org/nightly/$KIWIX_TOOLS_VERSION/kiwix-tools_linux-$ARCH-$KIWIX_TOOLS_VERSION.tar.gz" && \
|
||||
echo "URL: $url" && \
|
||||
mkdir /kiwix-serve && \
|
||||
curl -k -L $url | tar -xz -C /kiwix-serve --strip-components 1
|
||||
|
||||
# Build real "workload" container
|
||||
FROM python:3.13-slim-bookworm
|
||||
|
||||
# Add kiwix-serve
|
||||
COPY --from=kiwix-serve /kiwix-serve /usr/local/bin
|
||||
|
||||
# Update apt + install dependencies + install Google Chrome dependencies + clean-up apt lists
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -qqy wget xvfb unzip jq && \
|
||||
apt-get install -qqy libxss1 libappindicator1 libgconf-2-4 \
|
||||
fonts-liberation libasound2 libnspr4 libnss3 libx11-xcb1 libxtst6 lsb-release xdg-utils \
|
||||
libgbm1 libnss3 libatk-bridge2.0-0 libgtk-3-0 libx11-xcb1 libxcb-dri3-0 && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Fetch the latest version numbers and URLs for Chrome and ChromeDriver
|
||||
RUN wget -q -O /tmp/versions.json https://googlechromelabs.github.io/chrome-for-testing/last-known-good-versions-with-downloads.json
|
||||
|
||||
# Install chrome
|
||||
RUN CHROME_URL=$(jq -r '.channels.Stable.downloads.chrome[] | select(.platform=="linux64") | .url' /tmp/versions.json) && \
|
||||
wget -q --continue -O /tmp/chrome-linux64.zip $CHROME_URL && \
|
||||
unzip /tmp/chrome-linux64.zip -d /opt/chrome
|
||||
|
||||
RUN chmod +x /opt/chrome/chrome-linux64/chrome
|
||||
|
||||
# Install chromedriver
|
||||
RUN CHROMEDRIVER_URL=$(jq -r '.channels.Stable.downloads.chromedriver[] | select(.platform=="linux64") | .url' /tmp/versions.json) && \
|
||||
wget -q --continue -O /tmp/chromedriver-linux64.zip $CHROMEDRIVER_URL && \
|
||||
unzip /tmp/chromedriver-linux64.zip -d /opt/chromedriver && \
|
||||
chmod +x /opt/chromedriver/chromedriver-linux64/chromedriver
|
||||
|
||||
# Set up Chromedriver Environment variables
|
||||
ENV CHROMEDRIVER_DIR /opt/chromedriver
|
||||
ENV PATH $CHROMEDRIVER_DIR:$PATH
|
||||
|
||||
# Clean up
|
||||
RUN rm /tmp/chrome-linux64.zip /tmp/chromedriver-linux64.zip /tmp/versions.json
|
||||
|
||||
# Update pip, install selenium, create work directory
|
||||
RUN \
|
||||
python -m pip install --no-cache-dir -U \
|
||||
pip \
|
||||
selenium==4.28.1 \
|
||||
pytest==8.3.4 \
|
||||
&& mkdir -p /work
|
||||
128
tests-daily/daily.py
Normal file
128
tests-daily/daily.py
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
from time import sleep
|
||||
|
||||
import pytest
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support import expected_conditions
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
|
||||
KIWIX_SERVE_START_SLEEP = 1
|
||||
|
||||
ZIM_NAME = "tests_eng_test-website"
|
||||
YOUTUBE_VIDEO_PATH = "youtube.fuzzy.replayweb.page/embed/g5skcrNXdDM"
|
||||
|
||||
SKIP_YOUTUBE_TEST = os.getenv("SKIP_YOUTUBE_TEST", "False").lower() == "true"
|
||||
|
||||
CHECK_VIDEO_IS_PLAYING_AFTER_SECS = 30
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def chrome_driver():
    """Provide a headless Chrome driven by selenium, quit once the module is done"""

    logger.info("Starting Chrome")
    options = Options()
    for flag in ("--headless", "--no-sandbox"):
        options.add_argument(flag)
    # Other options of interest:
    # --disable-dev-shm-usage (not needed anymore with recent chrome versions)
    # --disable-gpu (important for some versions of Chrome)
    # --remote-debugging-port=9222 (should you need to remote debug)

    # Chrome binary location
    options.binary_location = "/opt/chrome/chrome-linux64/chrome"

    # ChromeDriver binary location
    service = ChromeService(
        executable_path="/opt/chromedriver/chromedriver-linux64/chromedriver"
    )

    browser = webdriver.Chrome(service=service, options=options)

    yield browser

    # Teardown
    logger.info("Quitting Chrome")
    browser.quit()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def kiwix_serve():
    """Start kiwix-serve with given ZIM

    Yields the ``subprocess.Popen`` handle of the server. Raises if the process has
    already exited after the start-up delay. On teardown the process is terminated
    and waited for, so that it is reaped instead of being left behind.
    """

    logger.info("Starting kiwix-serve")
    process = subprocess.Popen(
        [
            "/usr/bin/env",
            "/usr/local/bin/kiwix-serve",
            f"/output/{ZIM_NAME}.zim",
        ]
    )

    logger.info(
        f"Waiting {KIWIX_SERVE_START_SLEEP} secs to be 'sure' that kiwix-serve is ready"
    )
    sleep(KIWIX_SERVE_START_SLEEP)

    # poll() returns an exit code only once the process has ended
    if process.poll() is not None:
        raise Exception("kiwix-serve has terminated too early")

    yield process

    # Cleanup
    logger.info("Quitting kiwix-serve")
    process.terminate()
    try:
        # wait for the process to really exit so that it gets reaped
        process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        # did not react to SIGTERM in time: force it and reap it
        process.kill()
        process.wait()
|
||||
|
||||
|
||||
@pytest.mark.skipif(SKIP_YOUTUBE_TEST, reason="Youtube test disabled by environment")
def test_youtube_video(chrome_driver, kiwix_serve):  # noqa: ARG001
    """Check the embedded youtube video starts, and is still playing later on"""

    def wait_for_element(locator):
        # wait (1 sec max) for an element matching the locator to be in the DOM
        return WebDriverWait(chrome_driver, 1).until(
            expected_conditions.presence_of_element_located(locator)
        )

    def is_playing(element):
        # arguments[0] is the video tag passed to execute_script
        return chrome_driver.execute_script(
            "return arguments[0].paused === false", element
        )

    chrome_driver.get(f"http://localhost:80/content/{ZIM_NAME}/{YOUTUBE_VIDEO_PATH}")

    if chrome_driver.title == "Content not found":
        raise Exception("Wrong URL, kiwix-serve said that content is not found")

    play_button = wait_for_element((By.XPATH, "//button[@title='Play']"))
    logger.info("Play button found in page")

    play_button.click()

    video = wait_for_element((By.TAG_NAME, "video"))
    logger.info("Video found in page")

    if not is_playing(video):
        raise Exception("Video is not playing, failed to start probably")
    logger.info("Video is playing")

    logger.info(
        f"Waiting {CHECK_VIDEO_IS_PLAYING_AFTER_SECS} secs to check video is still "
        "playing"
    )
    sleep(CHECK_VIDEO_IS_PLAYING_AFTER_SECS)

    if not is_playing(video):
        raise Exception(
            "Video is not playing anymore after "
            f"{CHECK_VIDEO_IS_PLAYING_AFTER_SECS} secs"
        )
    logger.info("Video is still playing")
|
||||
1
tests-integration/README.md
Normal file
1
tests-integration/README.md
Normal file
|
|
@ -0,0 +1 @@
|
|||
These are integration tests, meant to be run inside the CI (because we first need to perform a zimit run on a given website and then check its output).
|
||||
145
tests-integration/integration.py
Normal file
145
tests-integration/integration.py
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
import glob
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from warcio import ArchiveIterator
|
||||
from zimscraperlib.zim import Archive
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "filename",
    [
        pytest.param("/output/tests_en_onepage.zim", id="onepage"),
        pytest.param("/output/tests_en_sizesoftlimit.zim", id="sizesoftlimit"),
        pytest.param("/output/tests_en_timesoftlimit.zim", id="timesoftlimit"),
    ],
)
def test_zim_created(filename):
    """Each of these ZIM paths must be an existing regular file"""
    is_regular_file = os.path.isfile(filename)
    assert is_regular_file
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "filename",
    [
        pytest.param("/output/tests_en_sizehardlimit.zim", id="sizehardlimit"),
        pytest.param("/output/tests_en_timehardlimit.zim", id="timehardlimit"),
    ],
)
def test_zim_not_created(filename):
    """Nothing must exist at any of these ZIM paths"""
    is_present = os.path.exists(filename)
    assert not is_present
|
||||
|
||||
|
||||
def test_zim_main_page():
    """Ensure the ZIM main entry is a redirect to the redirected page

    The main page specified,
    http://website.test.openzim.org/http-return-codes.html, was a redirect to
    https, so the main entry has to redirect to the page finally reached.
    """

    archive = Archive(Path("/output/tests_en_onepage.zim"))
    main_entry = archive.main_entry
    assert main_entry.is_redirect

    redirect_target = main_entry.get_redirect_entry().path
    assert redirect_target == "website.test.openzim.org/http-return-codes.html"
|
||||
|
||||
|
||||
def test_zim_scraper():
    """The Scraper metadata must mention zimit, warc2zim and Browsertrix-Crawler"""

    scraper = Archive(Path("/output/tests_en_onepage.zim")).get_text_metadata(
        "Scraper"
    )
    # trailing space: every tool name is expected to be followed by something
    for tool_prefix in ("zimit ", "warc2zim ", "Browsertrix-Crawler "):
        assert tool_prefix in scraper
|
||||
|
||||
|
||||
def test_files_list():
    """Check that expected files are present in the ZIM at proper path

    Every listed path must resolve to an entry with non-empty content.
    """
    zim_fh = Archive(Path("/output/tests_en_onepage.zim"))
    # each path is listed once: checking the same entry twice adds nothing
    for expected_entry in [
        "_zim_static/__wb_module_decl.js",
        "_zim_static/wombat.js",
        "_zim_static/wombatSetup.js",
        "website.test.openzim.org/http-return-codes.html",
        "website.test.openzim.org/200-response",
        "website.test.openzim.org/201-response",
        "website.test.openzim.org/202-response",
        "website.test.openzim.org/301-external-redirect-ok",
        "website.test.openzim.org/301-internal-redirect-ok",
        "website.test.openzim.org/302-external-redirect-ok",
        "website.test.openzim.org/302-internal-redirect-ok",
        "website.test.openzim.org/307-external-redirect-ok",
        "website.test.openzim.org/307-internal-redirect-ok",
        "website.test.openzim.org/308-external-redirect-ok",
        "website.test.openzim.org/308-internal-redirect-ok",
        "website.test.openzim.org/icons/favicon.ico",
        "website.test.openzim.org/icons/site.webmanifest",
        "website.test.openzim.org/internal_redirect_target.html",
        "www.example.com/",
    ]:
        assert zim_fh.get_content(expected_entry)
|
||||
|
||||
|
||||
def test_user_agent():
    """Check the User-Agent found in WARC request records

    Every request record carrying a User-Agent header must contain "Mozilla" and
    end with the custom Zimit and email suffix; at least one such record has to
    be found.
    """

    found = False
    warc_paths = glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz")
    for warc_path in warc_paths:
        with open(warc_path, "rb") as fh:
            for record in ArchiveIterator(fh):
                if record.rec_type != "request":
                    continue
                print(record.http_headers)  # noqa: T201
                user_agent = record.http_headers.get_header("User-Agent")
                if not user_agent:
                    continue
                assert "Mozilla" in user_agent
                assert user_agent.endswith(" +Zimit test@example.com")
                found = True

    # should find at least one
    assert found
|
||||
|
||||
|
||||
def test_stats_output_standard():
    """Compare the three JSON stats files of the run with their exact expected content"""

    def load_json(path):
        return json.loads(Path(path).read_bytes())

    crawl_stats = load_json("/output/crawl.json")
    assert crawl_stats == {
        "crawled": 17,
        "pending": 0,
        "pendingPages": [],
        "total": 35,
        "failed": 18,
        "limit": {"max": 0, "hit": False},
    }

    warc2zim_stats = load_json("/output/warc2zim.json")
    assert warc2zim_stats == {
        "written": 8,
        "total": 8,
    }

    overall_stats = load_json("/output/stats.json")
    assert overall_stats == {
        "done": 8,
        "total": 8,
        "partialZim": False,
    }
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "filename",
    [
        pytest.param("/output/stats_sizesoftlimit.json", id="sizesoftlimit"),
        pytest.param("/output/stats_timesoftlimit.json", id="timesoftlimit"),
    ],
)
def test_stats_output_softlimit(filename):
    """Check the stats file of a soft-limited run exists and flags a partial ZIM"""
    file = Path(filename)
    # `exists` has to be called: the bare bound method is always truthy
    assert file.exists()
    content = json.loads(file.read_bytes())
    assert "done" in content
    assert "total" in content
    assert "partialZim" in content
    assert content["partialZim"]
|
||||
14
tests/conftest.py
Normal file
14
tests/conftest.py
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
import pytest
|
||||
|
||||
from zimit import zimit as app
|
||||
|
||||
"""
|
||||
cleanup disabled because atexit hooks run at the very end of the Python process
|
||||
shutdown. By the time cleanup() is called, the logging module has already closed its
|
||||
file streams.
|
||||
"""
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def disable_zimit_cleanup(monkeypatch):
    """Replace the ``cleanup`` attribute of the zimit module with a no-op.

    The fixture is ``autouse``, so the replacement applies to every test.
    """

    def _skip_cleanup():
        return None

    monkeypatch.setattr(app, "cleanup", _skip_cleanup)
|
||||
BIN
tests/data/example-response.warc
Normal file
BIN
tests/data/example-response.warc
Normal file
Binary file not shown.
6
tests/test_dummy.py
Normal file
6
tests/test_dummy.py
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
from zimit.zimit import NORMAL_WARC2ZIM_EXIT_CODE
|
||||
|
||||
|
||||
# dummy test, just to have coverage report done
|
||||
def test_something_exists():
    """Check that the imported exit-code constant is truthy."""
    assert bool(NORMAL_WARC2ZIM_EXIT_CODE)
|
||||
83
tests/test_overwrite.py
Normal file
83
tests/test_overwrite.py
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
import pathlib
|
||||
|
||||
import pytest
|
||||
|
||||
from zimit.zimit import run
|
||||
|
||||
TEST_DATA_DIR = pathlib.Path(__file__).parent / "data"
|
||||
|
||||
|
||||
def test_overwrite_flag_behaviour(tmp_path):
    """Check that an existing ZIM file is only replaced when ``--overwrite`` is passed."""
    zim_output = "overwrite-test.zim"
    output_path = tmp_path / zim_output

    # Argument groups shared by the runs below; each call gets a fresh list.
    seed_args = ["--seeds", "https://example.com"]
    warc_args = ["--warcs", str(TEST_DATA_DIR / "example-response.warc")]
    target_args = [
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_output,
        "--name",
        "overwrite-test",
    ]

    # 1st run: the output file does not exist yet and gets created
    result = run(seed_args + warc_args + target_args)
    assert result in (None, 100)
    assert output_path.exists()

    # 2nd run, same arguments, no overwrite flag: must exit with code 2
    with pytest.raises(SystemExit) as exc:
        run(seed_args + warc_args + target_args)
    assert exc.value.code == 2

    # same again but without --warcs, still no overwrite flag: exit code 2 too
    with pytest.raises(SystemExit) as exc:
        run(seed_args + target_args)
    assert exc.value.code == 2

    # final run, with the overwrite flag: must succeed
    result = run(seed_args + warc_args + target_args + ["--overwrite"])
    assert result in (None, 100)
    assert output_path.exists()
|
||||
62
zimit.ini
62
zimit.ini
|
|
@ -1,62 +0,0 @@
|
|||
[app:main]
|
||||
use = egg:zimit
|
||||
|
||||
zimit.zimwriterfs_bin = /home/alexis/dev/openzim/zimwriterfs/zimwriterfs
|
||||
zimit.httrack_bin = /usr/bin/httrack
|
||||
zimit.output_location = /home/alexis/dev/zimit/zims
|
||||
zimit.output_url = http://zimit.notmyidea.org/zims
|
||||
|
||||
mail.host = localhost
|
||||
mail.port = 2525
|
||||
mail.default_sender = zimit@notmyidea.org
|
||||
|
||||
pyramid.includes =
|
||||
pyramid_mailer
|
||||
|
||||
[server:main]
|
||||
use = egg:waitress#main
|
||||
host = 0.0.0.0
|
||||
port = 6543
|
||||
|
||||
# Begin logging configuration
|
||||
|
||||
[uwsgi]
|
||||
wsgi-file = app.wsgi
|
||||
http-socket = :8000
|
||||
enable-threads = true
|
||||
master = true
|
||||
processes = 1
|
||||
virtualenv = .
|
||||
module = zimit
|
||||
lazy = true
|
||||
lazy-apps = true
|
||||
|
||||
|
||||
[loggers]
|
||||
keys = root, gplayproxy
|
||||
|
||||
[handlers]
|
||||
keys = console
|
||||
|
||||
[formatters]
|
||||
keys = generic
|
||||
|
||||
[logger_root]
|
||||
level = INFO
|
||||
handlers = console
|
||||
|
||||
[logger_gplayproxy]
|
||||
level = DEBUG
|
||||
handlers =
|
||||
qualname = gplayproxy
|
||||
|
||||
[handler_console]
|
||||
class = StreamHandler
|
||||
args = (sys.stderr,)
|
||||
level = NOTSET
|
||||
formatter = generic
|
||||
|
||||
[formatter_generic]
|
||||
format = %(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s
|
||||
|
||||
# End logging configuration
|
||||
|
|
@ -1,25 +0,0 @@
|
|||
from pyramid.config import Configurator
|
||||
from pyramid.events import NewRequest
|
||||
from pyramid.static import static_view
|
||||
|
||||
from redis import Redis
|
||||
from rq import Queue
|
||||
|
||||
|
||||
def main(global_config, **settings):
    """Build and return the WSGI application.

    A job queue bound to a Redis connection is stored on the registry and
    attached to every new request as ``request.queue``. Files under
    ``../app`` are served below ``/app/``.
    """
    config = Configurator(settings=settings)
    job_queue = Queue(connection=Redis())
    config.registry.queue = job_queue

    def attach_objects_to_request(event):
        # Make the shared queue reachable from the views.
        event.request.queue = config.registry.queue

    config.add_subscriber(attach_objects_to_request, NewRequest)

    for package in ("cornice", "pyramid_mailer"):
        config.include(package)
    config.scan("zimit.views")

    app_files = static_view("../app", use_subpath=True, index="index.html")
    config.add_route("catchall_static", "/app/*subpath")
    config.add_view(app_files, route_name="catchall_static")
    return config.make_wsgi_app()
|
||||
146
zimit/creator.py
146
zimit/creator.py
|
|
@ -1,146 +0,0 @@
|
|||
import os
|
||||
import os.path
|
||||
import shutil
|
||||
import tempfile
|
||||
import urlparse
|
||||
|
||||
from slugify import slugify
|
||||
|
||||
from zimit import utils
|
||||
|
||||
HTTRACK_BIN = "/usr/bin/httrack"
DEFAULT_AUTHOR = "ZimIt"


class ZimCreator(object):
    """A synchronous zim creator, using HTTrack to spider websites and
    zimwriterfs to create the zim files.

    Please note that every operation is blocking the interpreter. As such, it
    is recommended to run this operation in a worker if invoked from a website
    view / controller.
    """

    def __init__(self, zimwriterfs_bin, output_location,
                 author=DEFAULT_AUTHOR, httrack_bin=HTTRACK_BIN,
                 log_file=None, max_download_speed=25000):
        """Store the configuration and check that the needed paths exist.

        :param zimwriterfs_bin:
            Location of the zimwriterfs executable.
        :param output_location:
            Folder in which the generated zim files are written.
        :param author:
            Value passed to zimwriterfs as the publisher.
        :param httrack_bin:
            Location of the httrack executable.
        :param log_file:
            Optional file handed to ``utils.spawn`` for each command.
        :param max_download_speed:
            Value passed to HTTrack as ``--max-rate``.
        """
        self.output_location = output_location
        self.author = author
        self.zimwriterfs_bin = zimwriterfs_bin
        self.httrack_bin = httrack_bin
        self.log_file = log_file
        self.max_download_speed = max_download_speed

        utils.ensure_paths_exists(
            self.zimwriterfs_bin,
            self.httrack_bin,
            self.output_location)

    def _spawn(self, cmd):
        """Run *cmd* through ``utils.spawn`` with the configured log file."""
        return utils.spawn(cmd, self.log_file)

    def download_website(self, url, destination_path):
        """Downloads the website using HTTrack and wait for the results to
        be available before returning.

        :param url:
            The entry URL of the website to retrieve.

        :param destination_path:
            The absolute location of a folder where the files will be written.
        """
        # A value of None produces a bare flag (no "=value" part).
        options = {
            "path": destination_path,
            "max-rate": self.max_download_speed,
            "keep-alive": None,
            "robots": 0,
            "near": None,
        }

        self._spawn(utils.get_command(self.httrack_bin, url, **options))

    def prepare_website_folder(self, url, input_location):
        """Prepare the website files to make them ready to be embedded in a zim
        file.

        :param url:
            The URL that was downloaded; its network location (with ":"
            replaced by "_") is the name of the folder looked up.
        :param input_location:
            The folder containing the downloaded files.
        :returns:
            the absolute location of the website folder, ready to be embedded.
        :raises Exception:
            if the website folder is not found in *input_location*.
        """
        netloc = urlparse.urlparse(url).netloc.replace(":", "_")
        website_folder = os.path.join(input_location, netloc)
        if not os.path.isdir(website_folder):
            message = "Unable to find the website folder! %s" % website_folder
            raise Exception(message)
        # NOTE: the favicon is looked up relative to the current directory.
        shutil.copy('./favicon.ico', website_folder)
        return website_folder

    def create_zim(self, input_location, output_name, zim_options):
        """Create a zim file out of an existing folder on disk.

        :param input_location:
            The absolute location of the files to be bundled in the zim file.
        :param output_name:
            The name to use to create the zim file.
        :param zim_options:
            Options to pass to the zim creator. The mapping is left
            untouched; it must provide the ``welcome``, ``language``,
            ``title``, ``description`` and ``author`` keys.
        :returns:
            *output_name*, unchanged.
        """
        # Work on a copy so the caller's mapping is not polluted with the
        # internal keys added below.
        command_values = dict(zim_options)
        command_values.update({
            'bin': self.zimwriterfs_bin,
            'location': input_location,
            'output': os.path.join(self.output_location, output_name),
            'icon': 'favicon.ico',
            'publisher': self.author,
        })

        # Spawn zimwriterfs with the correct options.
        command = (
            '{bin} -w "{welcome}" -l "{language}" -t "{title}"'
            ' -d "{description}" -f {icon} -c "{author}"'
            ' -p "{publisher}" {location} {output}'
        ).format(**command_values)
        self._spawn(command)
        return output_name

    def create_zim_from_website(self, url, zim_options):
        """Create a zim file from a website. It might take some time.

        The name of the generated zim file is a slugified version of its URL.

        :param url:
            the URL of the website to download.

        :param zim_options:
            A dictionary of options to use when generating the Zim file. They
            are title, language, welcome and description.

        :returns:
            the name of the generated zim_file (relative to the output_folder)
        """
        temporary_location = tempfile.mkdtemp("zimit")
        try:
            self.download_website(url, temporary_location)
            website_folder = self.prepare_website_folder(
                url, temporary_location)
            output_name = "{slug}.zim".format(slug=slugify(url))
            return self.create_zim(website_folder, output_name, zim_options)
        finally:
            # The zim file is written to the output location, so the
            # downloaded copy of the website is no longer needed; remove it
            # whether or not the creation succeeded.
            shutil.rmtree(temporary_location, ignore_errors=True)
|
||||
|
||||
|
||||
def load_from_settings(settings, log_file=None):
    """Load the ZimCreator object from the given pyramid settings, converting
    them to actual parameters.

    This is a convenience function for people wanting to create a ZimCreator
    out of a ini file compatible with the pyramid framework.

    :param settings: the dictionary of settings.
    :param log_file: optional log file, passed through to the ZimCreator.
    :returns: the configured ZimCreator.
    :raises ValueError: if ``zimit.zimwriterfs_bin`` is not configured.
    """
    if 'zimit.zimwriterfs_bin' not in settings:
        raise ValueError('Please define zimit.zimwriterfs_bin config.')

    # Only forward the optional settings that are actually configured:
    # passing an explicit None would override the ZimCreator defaults.
    optional_settings = (
        ('httrack_bin', 'zimit.httrack_bin'),
        ('author', 'zimit.default_author'),
    )
    extra = {}
    for param, key in optional_settings:
        value = settings.get(key)
        if value is not None:
            extra[param] = value

    return ZimCreator(
        zimwriterfs_bin=settings['zimit.zimwriterfs_bin'],
        output_location=settings.get('zimit.output_location'),
        log_file=log_file,
        **extra
    )
|
||||
|
|
@ -1,42 +0,0 @@
|
|||
from pyramid_mailer.message import Attachment, Message
|
||||
from pyramid_mailer import Mailer
|
||||
|
||||
|
||||
def send_zim_url(settings, email, zim_url):
    """Mail the link of a zim file to one recipient, right away.

    :param settings:
        A pyramid settings object, used to build the pyramid_mailer mailer.
    :param email:
        The email of the recipient.
    :param zim_url:
        The URL of the zim file.
    """
    mailer = Mailer.from_settings(settings)
    mailer.send_immediately(ZimReadyMessage(email, zim_url))
|
||||
|
||||
|
||||
class ZimReadyMessage(Message):
    """Message telling a recipient where the generated zim file can be found."""

    def __init__(self, email, zim_link):
        """Build the message for *email*, embedding *zim_link* in its text."""
        text = """
Hi,

You have asked for the creation of a zim file, and it is now ready !

You can access it at the following URL:

{zim_link}

Cheers,
ZimIt.
""".format(zim_link=zim_link)

        # The very same text is used for the plain and the HTML parts.
        parts = {}
        for part_name in ("body", "html"):
            parts[part_name] = Attachment(
                data=text, transfer_encoding="quoted-printable")

        super(ZimReadyMessage, self).__init__(
            subject="[ZimIt!] Your zimfile is ready!",
            recipients=[email],
            **parts)
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
import os
|
||||
import shlex
|
||||
import subprocess
|
||||
|
||||
|
||||
def spawn(cmd, logfile=None):
    """Run *cmd*, wait for it to terminate and return the process object.

    :param cmd:
        The command line, as a single string (split with ``shlex``).
    :param logfile:
        When given, the command is run through ``stdbuf -o0`` and its
        standard output is appended to this file.
    """
    if logfile is None:
        process = subprocess.Popen(shlex.split(cmd))
    else:
        with open(logfile, "a+") as log:
            unbuffered = shlex.split("stdbuf -o0 %s" % cmd)
            process = subprocess.Popen(unbuffered, stdout=log)
    process.wait()
    return process
|
||||
|
||||
|
||||
def ensure_paths_exists(*paths):
    """Raise ``OSError`` for the first of *paths* that does not exist."""
    missing = [path for path in paths if not os.path.exists(path)]
    if missing:
        raise OSError('%s does not exist.' % missing[0])
|
||||
|
||||
|
||||
def get_command(cmd, *params, **options):
    """Assemble a command line string.

    The result is *cmd*, the space-joined *params* and the space-joined
    options, themselves joined with single spaces. An option whose value is
    None is rendered as ``--key``, any other as ``--key=value``.
    """
    flags = [
        "--%s" % name if value is None else "--%s=%s" % (name, value)
        for name, value in options.items()
    ]
    return " ".join([cmd, " ".join(params), " ".join(flags)])
|
||||
|
|
@ -1,63 +0,0 @@
|
|||
import os
|
||||
|
||||
from cornice import Service
|
||||
from colander import MappingSchema, SchemaNode, String
|
||||
from pyramid.httpexceptions import HTTPTemporaryRedirect, HTTPNotFound
|
||||
|
||||
from zimit.worker import create_zim
|
||||
|
||||
website = Service(name='website', path='/website-zim')
|
||||
home = Service(name='home', path='/')
|
||||
status = Service(name='status', path='/status/{id}')
|
||||
|
||||
|
||||
@home.get()
def redirect_to_app(request):
    """Send visitors of the home service to the application's index page."""
    target = "/app/index.html"
    raise HTTPTemporaryRedirect(target)
|
||||
|
||||
|
||||
def _body_string(**extra):
    """Return a string schema node read from the request body."""
    return SchemaNode(String(), location="body", type='str', **extra)


class WebSiteSchema(MappingSchema):
    """Body fields accepted when asking for a website to be turned into a zim."""

    # Fields declared without a default value.
    url = _body_string()
    title = _body_string()
    email = _body_string()
    # Fields declared with a default value.
    description = _body_string(default="-")
    author = _body_string(default=None)
    welcome = _body_string(default="index.html")
    language = _body_string(default="eng")
|
||||
|
||||
|
||||
@website.post(schema=WebSiteSchema)
def crawl_new_website(request):
    """Enqueue the creation of a zim file and answer 201 with the job id."""
    settings = request.registry.settings
    job = request.queue.enqueue(
        create_zim, settings, request.validated, timeout=1800)
    request.response.status_code = 201
    return {'job_id': job.id}
|
||||
|
||||
|
||||
@status.get()
def display_status(request):
    """Return the status of a job together with the content of its log file.

    Raises ``HTTPNotFound`` when the queue has no job with the requested id.
    The ``log`` entry is None when the log file does not exist.
    """
    job = request.queue.fetch_job(request.matchdict["id"])
    if job is None:
        raise HTTPNotFound()

    settings = request.registry.settings
    log_file = os.path.join(
        settings.get('zimit.logdir', '/tmp'), "%s.log" % job.id)

    if not os.path.exists(log_file):
        log_content = None
    else:
        with open(log_file) as log:
            log_content = log.read()

    return {"status": job.status, "log": log_content}
|
||||
|
|
@ -1,20 +0,0 @@
|
|||
import os
|
||||
import urlparse
|
||||
|
||||
from rq import get_current_job
|
||||
|
||||
from zimit.mailer import send_zim_url
|
||||
from zimit.creator import load_from_settings
|
||||
|
||||
|
||||
def create_zim(settings, options):
    """Create the zim file for ``options['url']`` and mail its URL.

    The commands are logged to ``<job id>.log`` in the ``zimit.logdir``
    setting (``/tmp`` when unset); the link is sent to ``options['email']``.
    """
    job = get_current_job()
    log_file = os.path.join(
        settings.get('zimit.logdir', '/tmp'), "%s.log" % job.id)

    creator = load_from_settings(settings, log_file)
    zim_name = creator.create_zim_from_website(options['url'], options)

    zim_url = urlparse.urljoin(settings.get('zimit.output_url'), zim_name)
    send_zim_url(settings, options['email'], zim_url)
|
||||
Loading…
Add table
Add a link
Reference in a new issue