Add --title and --description CLI args to write metadata into datapackage.json (#276)
Multi-word values including spaces must be enclosed in double quotes.

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
parent d4233582bb · commit b303af02ef

5 changed files with 45 additions and 6 deletions
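For example (a usage sketch adapted from the updated test further down; the bind mount and collection name are the test's values, the title and description strings are illustrative):

```js
// Usage sketch: multi-word --title/--description values must stay quoted so
// the shell passes each one through as a single argument.
import child_process from "child_process";

child_process.execSync(
  "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler " +
  "crawl --url http://www.example.com/ --generateWACZ --collection wr-net " +
  "--title \"My Crawl\" --description \"A crawl of example.com\"");
```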
README.md (15)

````diff
@@ -69,7 +69,7 @@ Options:
       --crawlId, --id                  A user provided ID for this crawl or
                                        crawl configuration (can also be se
                                        t via CRAWL_ID env var)
-                                       [string] [default: "c69e2434da85"]
+                                       [string] [default: "454230b33b8f"]
       --newContext                     Deprecated as of 0.8.0, any values p
                                        assed will be ignored
                                        [string] [default: null]
@@ -83,7 +83,7 @@ Options:
                                        [number] [default: 0]
       --pageLimit, --limit             Limit crawl to this number of pages
                                        [number] [default: 0]
-      --maxPageLimit                   Maximum pages to crawl, overriding
+      --maxPageLimit                   Maximum pages to crawl, overriding
                                        pageLimit if both are set
                                        [number] [default: 0]
       --pageLoadTimeout, --timeout     Timeout for each page to load (in se
@@ -137,6 +137,12 @@ Options:
                                        lude: stats (enabled by default), js
                                        errors, pywb, debug
                                        [string] [default: "stats"]
+      --logLevel                       Comma-separated list of log levels t
+                                       o include in logs
+                                       [string] [default: ""]
+      --context                        Comma-separated list of contexts to
+                                       include in logs
+                                       [string] [default: ""]
       --text                           If set, extract text to the pages.js
                                        onl file [boolean] [default: false]
       --cwd                            Crawl working directory for captures
@@ -229,6 +235,11 @@ Options:
       --lang                           if set, sets the language used by th
                                        e browser, should be ISO 639 languag
                                        e[-country] code [string]
+      --title                          If set, write supplied title into WA
+                                       CZ datapackage.json metadata[string]
+      --description, --desc            If set, write supplied description i
+                                       nto WACZ datapackage.json metadata
+                                       [string]
       --config                         Path to YAML config file
 ```
````
crawler.js (10)

```diff
@@ -841,6 +841,16 @@ export class Crawler {
         }
       }
 
+      if (this.params.title) {
+        createArgs.push("--title");
+        createArgs.push(this.params.title);
+      }
+
+      if (this.params.description) {
+        createArgs.push("--desc");
+        createArgs.push(this.params.description);
+      }
+
      createArgs.push("-f");
 
      warcFileList.forEach((val, index) => createArgs.push(path.join(archiveDir, val))); // eslint-disable-line no-unused-vars
```
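crawler.js pushes each flag and its value as separate argv entries, so spaces in the value never need re-quoting on the way to the `wacz create` subprocess. A minimal sketch of the resulting argument list (the params values are illustrative, and this omits the other args crawler.js builds):

```js
// Illustrative: how the new metadata flags end up in the wacz create argv.
const params = { title: "My Crawl", description: "A crawl of example.com" };
const createArgs = ["create"]; // crawler.js adds more args before this point

if (params.title) {
  createArgs.push("--title");
  createArgs.push(params.title);
}

if (params.description) {
  createArgs.push("--desc");
  createArgs.push(params.description);
}

createArgs.push("-f");
console.log(createArgs);
// [ 'create', '--title', 'My Crawl', '--desc', 'A crawl of example.com', '-f' ]
```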
docker-entrypoint.sh

```diff
@@ -20,9 +20,8 @@ if [ "$MY_GID" != "$VOLUME_GID" ] || [ "$MY_UID" != "$VOLUME_UID" ]; then
     useradd -ms /bin/bash -g $VOLUME_GID btrix
     usermod -o -u $VOLUME_UID btrix > /dev/null
 
-    cmd="cd $PWD; $@"
-    su btrix -c "$cmd"
+    su btrix -c '"$@"' -- argv0-ignore "$@"
 else
-    exec $@
+    exec "$@"
 fi
 
```
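The entrypoint fix matters for the same quoting reason: the old `cmd="cd $PWD; $@"` collapsed the argument array into one string, so a quoted multi-word value was re-split on whitespace when the shell re-parsed it, while the `"$@"` forms forward each argument intact. A small stand-in for the shell behavior, in plain JavaScript:

```js
// Illustrative: what flattening argv into a string does to a quoted value.
const argv = ["crawl", "--title", "test title"]; // one element per argument

// Old behavior: flatten, then let the shell re-split on whitespace.
const resplit = argv.join(" ").split(/\s+/);
console.log(resplit); // [ 'crawl', '--title', 'test', 'title' ] <- boundary lost

// New behavior: "$@" forwards the array unchanged.
console.log(argv);    // [ 'crawl', '--title', 'test title' ]
```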
tests/basic_crawl.test.js

```diff
@@ -7,7 +7,7 @@ import md5 from "md5";
 
 test("ensure basic crawl run with docker run passes", async () => {
-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2");
+  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title \"test title\" --description \"test description\"");
 
   child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz");
@@ -63,3 +63,11 @@ test("check that the hash in the pages folder and in the unzipped wacz folders m
 
 });
+
+test("check that the supplied title and description made it into datapackage.json", () => {
+  expect(fs.existsSync("test-crawls/collections/wr-net/wacz/datapackage.json")).toBe(true);
+
+  const data = fs.readFileSync("test-crawls/collections/wr-net/wacz/datapackage.json", "utf8");
+  const dataPackageJSON = JSON.parse(data);
+  expect(dataPackageJSON.title).toEqual("test title");
+  expect(dataPackageJSON.description).toEqual("test description");
+});
```
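The assertions pin down where the metadata lands: both values are top-level keys of datapackage.json. A rough sketch of the relevant slice (all other datapackage.json fields omitted; the surrounding structure is not shown in this commit):

```js
// Shape implied by the assertions above; other datapackage.json fields omitted.
const expectedMetadata = {
  title: "test title",             // from --title "test title"
  description: "test description"  // from --description "test description"
};
```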
util/argParser.js

```diff
@@ -349,6 +349,17 @@ class ArgParser {
       "lang": {
         describe: "if set, sets the language used by the browser, should be ISO 639 language[-country] code",
         type: "string"
       },
+
+      "title": {
+        describe: "If set, write supplied title into WACZ datapackage.json metadata",
+        type: "string"
+      },
+
+      "description": {
+        alias: ["desc"],
+        describe: "If set, write supplied description into WACZ datapackage.json metadata",
+        type: "string"
+      }
     };
   }
```
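argParser.js hands these definitions to yargs, so the alias and string typing come for free. A self-contained sketch of how the two new options parse (the standalone setup below is illustrative, not the crawler's actual bootstrap):

```js
// Illustrative standalone parse of the two new options via yargs.
import yargs from "yargs";
import { hideBin } from "yargs/helpers";

const params = yargs(hideBin(process.argv))
  .option("title", {
    describe: "If set, write supplied title into WACZ datapackage.json metadata",
    type: "string"
  })
  .option("description", {
    alias: ["desc"],
    describe: "If set, write supplied description into WACZ datapackage.json metadata",
    type: "string"
  })
  .parseSync();

// Both `--description "..."` and the `--desc` alias land on params.description;
// quoting keeps a multi-word value as a single token all the way here.
console.log(params.title, params.description);
```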