diff --git a/crawler.js b/crawler.js index 8e6319da..e3ed70af 100644 --- a/crawler.js +++ b/crawler.js @@ -107,14 +107,20 @@ class Crawler { } bootstrap() { - const opts = {stdio: "ignore", cwd: this.params.cwd}; + let opts = {} + if (this.params.pywb_log) { + opts = {stdio: "inherit", cwd: this.params.cwd}; + } + else{ + opts = {stdio: "ignore", cwd: this.params.cwd}; + } this.configureUA(); this.headers = {"User-Agent": this.userAgent}; child_process.spawn("redis-server", {...opts, cwd: "/tmp/"}); - + child_process.spawnSync("wb-manager", ["init", this.params.collection], opts); opts.env = {...process.env, COLL: this.params.collection}; @@ -222,6 +228,12 @@ class Crawler { default: false, }, + "pywb-log": { + describe: "If set, generate pywb log file", + type: "boolean", + default: false, + }, + "text": { describe: "If set, extract text to the pages.jsonl file", type: "boolean", @@ -420,7 +432,7 @@ class Crawler { if (this.params.text){ const client = await page.target().createCDPSession(); const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true}); - text = await new TextExtract(result).parseTextFromDom() + text = await new TextExtract(result).parseTextFromDom(); } this.writePage(data.url, title, this.params.text, text); @@ -717,4 +729,3 @@ class Crawler { } module.exports.Crawler = Crawler; -