Use preferred Browsertrix Crawler arguments and fix multiple/file seeds support
This commit is contained in: commit ed1a8a0aa9 (parent dc6b5aafb7)
6 changed files with 75 additions and 46 deletions
2 .github/workflows/DailyTests.yaml

````diff
@@ -18,7 +18,7 @@ jobs:
         run: docker build -t local-zimit .

       - name: run crawl of test website
-        run: docker run -v $PWD/output:/output local-zimit zimit --url https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim
+        run: docker run -v $PWD/output:/output local-zimit zimit --seeds https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim

       - name: archive ZIM
         uses: actions/upload-artifact@v4
````
10 .github/workflows/Tests.yaml

````diff
@@ -63,19 +63,19 @@ jobs:
         run: docker run -v $PWD/output:/output local-zimit zimit --help

       - name: run crawl with soft size limit
-        run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizesoftlimit.json
+        run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizesoftlimit.json

       - name: run crawl with hard size limit
-        run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizehardlimit.json || true
+        run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizehardlimit.json || true

       - name: run crawl with soft time limit
-        run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timesoftlimit.json
+        run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timesoftlimit.json

       - name: run crawl with hard time limit
-        run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timehardlimit.json || true
+        run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timehardlimit.json || true

       - name: run standard crawl
-        run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
+        run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep

       - name: run integration test suite
         run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
````
CHANGELOG.md

````diff
@@ -13,6 +13,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

 - Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
+- Add many missing Browsertrix Crawler arguments; drop default overrides by zimit; drop `--noMobileDevice` setting (not needed anymore) (#433)
+- Document all Browsertrix Crawler default argument values (#416)
+- Use preferred Browsertrix Crawler argument names:
+  - `--seeds` instead of `--url`
+  - `--seedFile` instead of `--urlFile`
+  - `--pageLimit` instead of `--limit`
+  - `--pageLoadTimeout` instead of `--timeout`
+  - `--scopeIncludeRx` instead of `--include`
+  - `--scopeExcludeRx` instead of `--exclude`
+  - `--pageExtraDelay` instead of `--delay`

 ### Fixed
````
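The renames above are drop-in replacements: only the flag name changes, not its behavior. A minimal before/after sketch (the URL, limits, and regexes are made-up values, not taken from this commit):

```bash
# before this commit (deprecated argument names)
zimit --url https://example.com/ --limit 100 --timeout 90 \
      --include ".*/docs/.*" --exclude ".*/login.*" --delay 2 \
      --name demo --zim-file demo.zim

# after this commit (preferred Browsertrix Crawler argument names)
zimit --seeds https://example.com/ --pageLimit 100 --pageLoadTimeout 90 \
      --scopeIncludeRx ".*/docs/.*" --scopeExcludeRx ".*/login.*" --pageExtraDelay 2 \
      --name demo --zim-file demo.zim
```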
13 README.md

````diff
@@ -38,16 +38,15 @@ Usage
 `zimit` is intended to be run in Docker. Docker image is published at https://github.com/orgs/openzim/packages/container/package/zimit.

-The image accepts the following parameters, **as well as any of the [warc2zim](https://github.com/openzim/warc2zim) ones**; useful for setting metadata, for instance:
+The image accepts the following parameters, **as well as any of the [Browsertrix Crawler](https://crawler.docs.browsertrix.com/user-guide/cli-options/) and [warc2zim](https://github.com/openzim/warc2zim) ones**:

-- Required: `--url URL` - the url to be crawled
+- Required: `--seeds URL` - the URL to start crawling from; multiple URLs can be separated by a comma (even if **usually not needed**, these are just the **seeds** of the crawl); the first seed URL is used as the ZIM homepage
 - Required: `--name` - Name of ZIM file
 - `--output` - output directory (defaults to `/output`)
-- `--limit U` - Limit capture to at most U URLs
+- `--pageLimit U` - Limit capture to at most U URLs
 - `--behaviors` - Control which browsertrix behaviors are run (defaults to `autoplay,autofetch,siteSpecific`; adding `autoscroll` to the list automatically scrolls pages and fetches lazy-loaded resources)
-- `--exclude <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded.
+- `--scopeExcludeRx <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--scopeExcludeRx="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded.
 - `--workers N` - number of crawl workers to be run in parallel
-- `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load, for example).
+- `--waitUntil` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--waitUntil domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load, for example).
 - `--keep` - in case of failure, WARC files and other temporary files (stored in a subfolder of the output directory) are always kept; otherwise they are automatically deleted. Use this flag to always keep WARC files, even in case of success.

 Example command:
@@ -55,7 +54,7 @@ Example command:
 ```bash
 docker run ghcr.io/openzim/zimit zimit --help
 docker run ghcr.io/openzim/zimit warc2zim --help
-docker run -v /output:/output ghcr.io/openzim/zimit zimit --url URL --name myzimfile
+docker run -v /output:/output ghcr.io/openzim/zimit zimit --seeds URL --name myzimfile
 ```

 **Note**: Image automatically filters out a large number of ads by using the 3 blocklists from [anudeepND](https://github.com/anudeepND/blacklist). If you don't want this filtering, disable the image's entrypoint in your container (`docker run --entrypoint="" ghcr.io/openzim/zimit ...`).
````
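Putting the new README options together, a hypothetical crawl with two comma-separated seeds (the first one becomes the ZIM homepage) and an exclusion pattern might look like this; the URLs, limit, and names are illustrative only:

```bash
docker run -v /output:/output ghcr.io/openzim/zimit zimit \
  --seeds "https://example.com/,https://example.com/blog/" \
  --scopeExcludeRx "(\?q=|\?cid=)" \
  --pageLimit 500 \
  --name example_site --zim-file example_site.zim
```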
````diff
@@ -1 +1 @@
-__version__ = "2.1.9-dev0"
+__version__ = "3.0.0-dev0"
````
zimit.py

````diff
@@ -128,15 +128,21 @@ def run(raw_args):
         description="Run a browser-based crawl on the specified URL and convert to ZIM"
     )

-    parser.add_argument("-u", "--url", help="The URL to start crawling from")
+    parser.add_argument(
+        "--seeds",
+        help="The seed URL(s) to start crawling from. Multiple seed URLs must be "
+        "separated by a comma (usually not needed, these are just the crawl seeds). "
+        "First seed URL is used as ZIM homepage",
+    )

-    parser.add_argument("--title", help="ZIM title")
-    parser.add_argument("--description", help="ZIM description")
+    parser.add_argument("--title", help="WARC and ZIM title")
+    parser.add_argument("--description", help="WARC and ZIM description")
     parser.add_argument("--long-description", help="ZIM long description metadata")

     parser.add_argument(
-        "--urlFile",
-        help="If set, read a list of seed urls, one per line, from the specified",
+        "--seedFile",
+        help="If set, read a list of seed urls, one per line. Can be a local file or "
+        "the HTTP(s) URL to an online file.",
     )

     parser.add_argument(
@@ -170,7 +176,7 @@ def run(raw_args):
     )

     parser.add_argument(
-        "--limit",
+        "--pageLimit",
         help="Limit crawl to this number of pages. Default is 0 (no limit).",
         type=int,
     )
@@ -183,7 +189,7 @@ def run(raw_args):
     )

     parser.add_argument(
-        "--timeout",
+        "--pageLoadTimeout",
         help="Timeout for each page to load (in seconds). Default is 90 secs.",
         type=int,
     )
@@ -197,13 +203,13 @@ def run(raw_args):
     )

     parser.add_argument(
-        "--include",
+        "--scopeIncludeRx",
         help="Regex of page URLs that should be included in the crawl (defaults to "
         "the immediate directory of URL)",
     )

     parser.add_argument(
-        "--exclude",
+        "--scopeExcludeRx",
         help="Regex of page URLs that should be excluded from the crawl",
     )

@@ -446,7 +452,7 @@ def run(raw_args):
     )

     parser.add_argument(
-        "--delay",
+        "--pageExtraDelay",
         help="If >0, amount of time to sleep (in seconds) after behaviors "
         "before moving on to next page. Default is 0.",
         type=int,
````
````diff
@@ -762,16 +768,40 @@ def run(raw_args):
     warc2zim_args.append("--output")
     warc2zim_args.append(zimit_args.output)

-    url = zimit_args.url
-
     user_agent_suffix = zimit_args.userAgentSuffix
     if zimit_args.adminEmail:
         user_agent_suffix += f" {zimit_args.adminEmail}"

-    if url:
-        url = get_cleaned_url(url)
-        warc2zim_args.append("--url")
-        warc2zim_args.append(url)
+    # make temp dir for this crawl
+    global temp_root_dir  # noqa: PLW0603
+    if zimit_args.build:
+        temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp"))
+    else:
+        temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
+
+    seeds = []
+    if zimit_args.seeds:
+        seeds += [get_cleaned_url(url) for url in zimit_args.seeds.split(",")]
+    if zimit_args.seedFile:
+        if re.match(r"^https?\://", zimit_args.seedFile):
+            with tempfile.NamedTemporaryFile(
+                dir=temp_root_dir,
+                prefix="seeds_",
+                suffix=".txt",
+                delete_on_close=True,
+            ) as filename:
+                seed_file = Path(filename.name)
+                download_file(zimit_args.seedFile, seed_file)
+                seeds += [
+                    get_cleaned_url(url) for url in seed_file.read_text().splitlines()
+                ]
+        else:
+            seeds += [
+                get_cleaned_url(url)
+                for url in Path(zimit_args.seedFile).read_text().splitlines()
+            ]
+    warc2zim_args.append("--url")
+    warc2zim_args.append(seeds[0])

     if zimit_args.custom_css:
         warc2zim_args += ["--custom-css", zimit_args.custom_css]
````
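Per the hunk above, `--seedFile` now accepts either a local path or an HTTP(S) URL; remote lists are downloaded into the crawl's temporary directory before being parsed, and every entry is cleaned like a regular seed, with the first resolved seed becoming the ZIM homepage. A usage sketch (paths, URLs, and names are hypothetical):

```bash
# local seed file, one URL per line
printf 'https://example.com/\nhttps://example.com/faq/\n' > output/seeds.txt
docker run -v $PWD/output:/output ghcr.io/openzim/zimit zimit \
  --seedFile /output/seeds.txt --name demo --zim-file demo.zim

# remote seed file: zimit downloads it itself before parsing
docker run -v $PWD/output:/output ghcr.io/openzim/zimit zimit \
  --seedFile https://example.com/seeds.txt --name demo --zim-file demo.zim
```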
````diff
@@ -800,13 +830,6 @@ def run(raw_args):
         logger.info("Exiting, invalid warc2zim params")
         return EXIT_CODE_WARC2ZIM_CHECK_FAILED

-    # make temp dir for this crawl
-    global temp_root_dir  # noqa: PLW0603
-    if zimit_args.build:
-        temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp"))
-    else:
-        temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
-
     if not zimit_args.keep:
         atexit.register(cleanup)

````
````diff
@@ -841,9 +864,9 @@ def run(raw_args):
         zimit_args.customBehaviors = None

     cmd_args = get_node_cmd_line(zimit_args)
-    if url:
-        cmd_args.append("--url")
-        cmd_args.append(url)
+    for seed in seeds:
+        cmd_args.append("--seeds")
+        cmd_args.append(seed)

     cmd_args.append("--userAgentSuffix")
     cmd_args.append(user_agent_suffix)
````
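Since zimit expands seed lists itself, each cleaned seed is forwarded to Browsertrix Crawler as its own `--seeds` argument (and `--seedFile` is dropped from the pass-through list in the next hunk, as zimit consumes the file). Roughly, and assuming the crawler entrypoint is named `crawl` with a hypothetical user-agent suffix:

```bash
# zimit invocation (illustrative URLs)
zimit --seeds https://example.com/,https://example.org/ --name demo

# expands to a crawler command line along these lines (abbreviated sketch)
crawl --seeds https://example.com/ --seeds https://example.org/ \
      --userAgentSuffix "+Zimit"
```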
````diff
@@ -1032,18 +1055,17 @@ def get_node_cmd_line(args):
     for arg in [
         "title",
         "description",
-        "urlFile",
         "workers",
         "crawlId",
         "waitUntil",
         "depth",
         "extraHops",
-        "limit",
+        "pageLimit",
         "maxPageLimit",
-        "timeout",
+        "pageLoadTimeout",
         "scopeType",
-        "include",
-        "exclude",
+        "scopeIncludeRx",
+        "scopeExcludeRx",
         "collection",
         "allowHashUrls",
         "selectLinks",
@@ -1074,7 +1096,7 @@ def get_node_cmd_line(args):
         "behaviors",
         "behaviorTimeout",
         "postLoadDelay",
-        "delay",
+        "pageExtraDelay",
         "dedupPolicy",
         "profile",
         "screenshot",
````