From b303af02efa976a42b8df12ba1e68fd86c7f04f3 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 4 Apr 2023 10:46:03 -0400 Subject: [PATCH] Add --title and --description CLI args to write metadata into datapackage.json (#276) Multi-word values including spaces must be enclosed in double quotes. Co-authored-by: Ilya Kreymer --- README.md | 15 +++++++++++++-- crawler.js | 10 ++++++++++ docker-entrypoint.sh | 5 ++--- tests/basic_crawl.test.js | 10 +++++++++- util/argParser.js | 11 +++++++++++ 5 files changed, 45 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index fd6892dd..db013d0c 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ Options: --crawlId, --id A user provided ID for this crawl or crawl configuration (can also be se t via CRAWL_ID env var) - [string] [default: "c69e2434da85"] + [string] [default: "454230b33b8f"] --newContext Deprecated as of 0.8.0, any values p assed will be ignored [string] [default: null] @@ -83,7 +83,7 @@ Options: [number] [default: 0] --pageLimit, --limit Limit crawl to this number of pages [number] [default: 0] - --maxPageLimit Maximum pages to crawl, overriding + --maxPageLimit Maximum pages to crawl, overriding pageLimit if both are set [number] [default: 0] --pageLoadTimeout, --timeout Timeout for each page to load (in se @@ -137,6 +137,12 @@ Options: lude: stats (enabled by default), js errors, pywb, debug [string] [default: "stats"] + --logLevel Comma-separated list of log levels t + o include in logs + [string] [default: ""] + --context Comma-separated list of contexts to + include in logs + [string] [default: ""] --text If set, extract text to the pages.js onl file [boolean] [default: false] --cwd Crawl working directory for captures @@ -229,6 +235,11 @@ Options: --lang if set, sets the language used by th e browser, should be ISO 639 languag e[-country] code [string] + --title If set, write supplied title into WA + CZ datapackage.json metadata[string] + --description, --desc If set, write supplied description i + nto WACZ datapackage.json metadata + [string] --config Path to YAML config file ``` diff --git a/crawler.js b/crawler.js index c4b357af..6528a2d5 100644 --- a/crawler.js +++ b/crawler.js @@ -841,6 +841,16 @@ export class Crawler { } } + if (this.params.title) { + createArgs.push("--title"); + createArgs.push(this.params.title); + } + + if (this.params.description) { + createArgs.push("--desc"); + createArgs.push(this.params.description); + } + createArgs.push("-f"); warcFileList.forEach((val, index) => createArgs.push(path.join(archiveDir, val))); // eslint-disable-line no-unused-vars diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 895408d3..d8224bc7 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -20,9 +20,8 @@ if [ "$MY_GID" != "$VOLUME_GID" ] || [ "$MY_UID" != "$VOLUME_UID" ]; then useradd -ms /bin/bash -g $VOLUME_GID btrix usermod -o -u $VOLUME_UID btrix > /dev/null - cmd="cd $PWD; $@" - su btrix -c "$cmd" + su btrix -c '"$@"' -- argv0-ignore "$@" else - exec $@ + exec "$@" fi diff --git a/tests/basic_crawl.test.js b/tests/basic_crawl.test.js index d7a20133..3f59b94d 100644 --- a/tests/basic_crawl.test.js +++ b/tests/basic_crawl.test.js @@ -7,7 +7,7 @@ import md5 from "md5"; test("ensure basic crawl run with docker run passes", async () => { - child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2"); + child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title \"test title\" --description \"test description\""); child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz"); @@ -63,3 +63,11 @@ test("check that the hash in the pages folder and in the unzipped wacz folders m }); +test("check that the supplied title and description made it into datapackage.json", () => { + expect(fs.existsSync("test-crawls/collections/wr-net/wacz/datapackage.json")).toBe(true); + + const data = fs.readFileSync("test-crawls/collections/wr-net/wacz/datapackage.json", "utf8"); + const dataPackageJSON = JSON.parse(data); + expect(dataPackageJSON.title).toEqual("test title"); + expect(dataPackageJSON.description).toEqual("test description"); +}); diff --git a/util/argParser.js b/util/argParser.js index b897f751..14e69739 100644 --- a/util/argParser.js +++ b/util/argParser.js @@ -349,6 +349,17 @@ class ArgParser { "lang": { describe: "if set, sets the language used by the browser, should be ISO 639 language[-country] code", type: "string" + }, + + "title": { + describe: "If set, write supplied title into WACZ datapackage.json metadata", + type: "string" + }, + + "description": { + alias: ["desc"], + describe: "If set, write supplied description into WACZ datapackage.json metadata", + type: "string" } }; }