Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on the designated port that returns 200 if the health check succeeds (number of consecutive failed page loads < 2 * number of workers), or 503 if it fails. Useful for k8s health checks.
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state is optionally saved) when the size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state is optionally saved) when the total running time is exceeded.
- Add option to overwrite an existing collection. If `--overwrite` is included, any existing data for the specified collection is deleted.
- S3 storage refactor: simplify, don't add additional paths by default.
- Add interpolateFilename as a generic utility, supported in the filename and the STORE_PATH env value.
- wacz save: re-enable WACZ validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- Bump to 0.6.0-beta.1
Parent: 500ed1f9a1
Commit: 93b6dad7b9
9 changed files with 281 additions and 69 deletions
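The health check described above can be probed with a plain HTTP request: the server returns 200 while the count of consecutive failed page loads stays below 2 * workers, and 503 once that threshold is reached. A minimal probe sketch, assuming the crawler was started with `--healthCheckPort 6065` (the port value here is an arbitrary assumption, not a default):

// poll-healthz.js: minimal sketch, not part of the crawler itself
const http = require("http");

const port = 6065; // assumed value passed to --healthCheckPort

http.get({ host: "localhost", port, path: "/healthz" }, (res) => {
  if (res.statusCode === 200) {
    console.log("crawler healthy");
  } else {
    // the crawler answers 503 once consecutive page-load errors reach 2 * workers
    console.log(`crawler unhealthy, status ${res.statusCode}`);
    process.exitCode = 1;
  }
}).on("error", (err) => {
  console.error("health check request failed:", err.message);
  process.exitCode = 1;
});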
README.md (39 changes)

@@ -39,9 +39,13 @@ Here's how you can use some of the command-line options to configure the crawl:

- To limit the crawl to a maximum number of pages, add `--limit P` where P is the number of pages that will be crawled.

- To limit the crawl to a maximum size, set `--sizeLimit` (size in bytes)

- To limit the crawl time, set `--timeLimit` (in seconds)

- To run more than one browser worker and crawl in parallel, and `--workers N` where N is number of browsers to run in parallel. More browsers will require more CPU and network bandwidth, and does not guarantee faster crawling.

- To crawl into a new directory, specify a different name for the `--collection` param, or, if omitted, a new collection directory based on current time will be created.
- To crawl into a new directory, specify a different name for the `--collection` param, or, if omitted, a new collection directory based on current time will be created. Adding the `--overwrite` flag will delete the collection directory at the start of the crawl, if it exists.

Browsertrix Crawler includes a number of additional command-line options, explained below.

@@ -64,7 +68,7 @@ Browsertrix Crawler includes a number of additional command-line options, explai

--crawlId, --id A user provided ID for this crawl or
crawl configuration (can also be set
via CRAWL_ID env var)
[string] [default: "4dd1535f7800"]
[string] [default: <hostname> or CRAWL_ID env variable]
--newContext The context for each new capture,
can be a new: page, window, session
or browser.

@@ -75,6 +79,9 @@ Browsertrix Crawler includes a number of additional command-line options, explai

[default: "load,networkidle2"]
--depth The depth of the crawl for all seeds
[number] [default: -1]
--extraHops Number of extra 'hops' to follow,
beyond the current scope
[number] [default: 0]
--limit Limit crawl to this number of pages
[number] [default: 0]
--timeout Timeout for each page to load (in

@@ -82,7 +89,8 @@ Browsertrix Crawler includes a number of additional command-line options, explai

--scopeType A predfined scope of the crawl. For
more customization, use 'custom' and
set scopeIncludeRx regexes
[string] [choices: "page", "page-spa", "prefix", "host", "domain", "any", "custom"]
[string] [choices: "page", "page-spa", "prefix", "host", "domain", "any",
"custom"]
--scopeIncludeRx, --include Regex of page URLs that should be
included in the crawl (defaults to
the immediate directory of URL)

@@ -103,7 +111,7 @@ Browsertrix Crawler includes a number of additional command-line options, explai

-c, --collection Collection name to crawl to (replay
will be accessible under this name
in pywb preview)
[string] [default: "capture-YYYY-MM-DDThh:mm:ss"]
[string] [default: "crawl-@ts"]
--headless Run in headless mode, otherwise
start xvfb[boolean] [default: false]
--driver JS driver for the crawler

@@ -157,6 +165,10 @@ Browsertrix Crawler includes a number of additional command-line options, explai

an HTTP server with screencast
accessible on this port
[number] [default: 0]
--screencastRedis If set, will use the state store
redis pubsub for screencasting.
Requires --redisStoreUrl to be set
[boolean] [default: false]
--warcInfo, --warcinfo Optional fields added to the
warcinfo record in combined WARCs
--redisStoreUrl If set, url for remote redis server

@@ -167,6 +179,25 @@ Browsertrix Crawler includes a number of additional command-line options, explai

Defaults to 'partial', only saved
when crawl is interrupted
[string] [choices: "never", "partial", "always"] [default: "partial"]
--saveStateInterval If save state is set to 'always',
also save state during the crawl at
this interval (in seconds)
[number] [default: 300]
--saveStateHistory Number of save states to keep during
the duration of a crawl
[number] [default: 5]
--sizeLimit If set, save state and exit if size
limit exceeds this value
[number] [default: 0]
--timeLimit If set, save state and exit after
time limit, in seconds
[number] [default: 0]
--healthCheckPort port to run healthcheck on
[number] [default: 0]
--overwrite overwrite current crawl data: if
set, existing collection directory
will be deleted before crawl is
started [boolean] [default: false]
--config Path to YAML config file
```
</details>
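To make the units of the two new limits documented above concrete, here is a small sketch of values one might compute before passing them on the command line (the specific numbers are arbitrary examples, not defaults):

// --sizeLimit is expressed in bytes, --timeLimit in seconds
const sizeLimit = 25 * 1024 * 1024 * 1024; // 25 GiB
const timeLimit = 6 * 60 * 60;             // 6 hours
console.log(`--sizeLimit ${sizeLimit} --timeLimit ${timeLimit}`);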
crawler.js (118 changes)

@@ -3,6 +3,8 @@ const path = require("path");

const fs = require("fs");
const os = require("os");
const fsp = require("fs/promises");
const http = require("http");
const url = require("url");

// to ignore HTTPS error for HEAD check
const HTTPS_AGENT = require("https").Agent({

@@ -25,7 +27,7 @@ const warcio = require("warcio");

const behaviors = fs.readFileSync(path.join(__dirname, "node_modules", "browsertrix-behaviors", "dist", "behaviors.js"), {encoding: "utf8"});

const TextExtract = require("./util/textextract");
const { initStorage, getFileSize } = require("./util/storage");
const { initStorage, getFileSize, getDirSize, interpolateFilename } = require("./util/storage");
const { ScreenCaster, WSTransport, RedisPubSubTransport } = require("./util/screencaster");
const { parseArgs } = require("./util/argParser");
const { initRedis } = require("./util/redis");

@@ -48,6 +50,10 @@ class Crawler {

// pages file
this.pagesFH = null;

this.crawlId = process.env.CRAWL_ID || os.hostname();

this.startTime = Date.now();

// was the limit hit?
this.limitHit = false;

@@ -89,6 +95,10 @@ class Crawler {

this.pagesFile = path.join(this.pagesDir, "pages.jsonl");

this.blockRules = null;

this.errorCount = 0;

this.exitCode = 0;
}

statusLog(...args) {

@@ -170,8 +180,7 @@ class Crawler {

transport = new WSTransport(this.params.screencastPort);
this.debugLog(`Screencast server started on: ${this.params.screencastPort}`);
} else if (this.params.redisStoreUrl && this.params.screencastRedis) {
const crawlId = process.env.CRAWL_ID || os.hostname();
transport = new RedisPubSubTransport(this.params.redisStoreUrl, crawlId);
transport = new RedisPubSubTransport(this.params.redisStoreUrl, this.crawlId);
this.debugLog("Screencast enabled via redis pubsub");
}

@@ -201,6 +210,15 @@ class Crawler {

subprocesses.push(child_process.spawn("redis-server", {...opts, cwd: "/tmp/"}));

if (this.params.overwrite) {
console.log(`Clearing ${this.collDir} before starting`);
try {
fs.rmSync(this.collDir, { recursive: true, force: true });
} catch(e) {
console.warn(e);
}
}

child_process.spawnSync("wb-manager", ["init", this.params.collection], opts);

opts.env = {...process.env, COLL: this.params.collection, ROLLOVER_SIZE: this.params.rolloverSize};

@@ -250,11 +268,11 @@ class Crawler {

try {
await this.crawl();
process.exit(0);
process.exit(this.exitCode);
} catch(e) {
console.error("Crawl failed");
console.error(e);
process.exit(1);
process.exit(9);
}
}

@@ -317,6 +335,8 @@ class Crawler {

await this.writeStats();

await this.checkLimits();

await this.serializeConfig();

} catch (e) {

@@ -351,9 +371,61 @@ class Crawler {

return buffer;
}

async healthCheck(req, res) {
const threshold = this.params.workers * 2;
const pathname = url.parse(req.url).pathname;
switch (pathname) {
case "/healthz":
if (this.errorCount < threshold) {
console.log(`health check ok, num errors ${this.errorCount} < ${threshold}`);
res.writeHead(200);
res.end();
}
return;
}

console.log(`health check failed: ${this.errorCount} >= ${threshold}`);
res.writeHead(503);
res.end();
}

async checkLimits() {
let interrupt = false;

if (this.params.sizeLimit) {
const dir = path.join(this.collDir, "archive");

const size = await getDirSize(dir);

if (size >= this.params.sizeLimit) {
console.log(`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`);
interrupt = true;
}
}

if (this.params.timeLimit) {
const elapsed = (Date.now() - this.startTime) / 1000;
if (elapsed > this.params.timeLimit) {
console.log(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
interrupt = true;
}
}

if (interrupt) {
this.crawlState.setDrain();
this.exitCode = 11;
}
}

async crawl() {
this.profileDir = await loadProfile(this.params.profile);

if (this.params.healthCheckPort) {
this.healthServer = http.createServer((...args) => this.healthCheck(...args));
this.statusLog(`Healthcheck server started on ${this.params.healthCheckPort}`);
this.healthServer.listen(this.params.healthCheckPort);
}

try {
this.driver = require(this.params.driver);
} catch(e) {

@@ -362,7 +434,7 @@ class Crawler {

}

if (this.params.generateWACZ) {
this.storage = initStorage("data/");
this.storage = initStorage();
}

// Puppeteer Cluster init and options

@@ -428,7 +500,7 @@ class Crawler {

if (this.params.generateCDX) {
this.statusLog("Generating CDX");

child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd});
await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd}));
}

if (this.params.generateWACZ) {

@@ -471,9 +543,9 @@ class Crawler {

warcFileList.forEach((val, index) => createArgs.push(path.join(archiveDir, val))); // eslint-disable-line no-unused-vars

// create WACZ
const waczResult = child_process.spawnSync("wacz" , createArgs, {stdio: "inherit"});
const waczResult = await this.awaitProcess(child_process.spawn("wacz" , createArgs, {stdio: "inherit"}));

if (waczResult.status !== 0) {
if (waczResult !== 0) {
console.log("create result", waczResult);
throw new Error("Unable to write WACZ successfully");
}

@@ -483,19 +555,28 @@ class Crawler {

// Verify WACZ
validateArgs.push(waczPath);

const waczVerifyResult = child_process.spawnSync("wacz", validateArgs, {stdio: "inherit"});
const waczVerifyResult = await this.awaitProcess(child_process.spawn("wacz", validateArgs, {stdio: "inherit"}));

if (waczVerifyResult.status !== 0) {
if (waczVerifyResult !== 0) {
console.log("validate", waczVerifyResult);
throw new Error("Unable to verify WACZ created successfully");
}

if (this.storage) {
const finished = await this.crawlState.finished();
await this.storage.uploadCollWACZ(waczPath, finished);
const filename = process.env.STORE_FILENAME || "@ts-@id.wacz";
const targetFilename = interpolateFilename(filename, this.crawlId);

await this.storage.uploadCollWACZ(waczPath, targetFilename, finished);
}
}

awaitProcess(proc) {
return new Promise((resolve) => {
proc.on("close", (code) => resolve(code));
});
}

async writeStats() {
if (this.params.statsFilename) {
const total = this.cluster.allTargetCount;

@@ -543,12 +624,13 @@ class Crawler {

const gotoOpts = isHTMLPage ? this.gotoOpts : "domcontentloaded";

try {
//await Promise.race([page.goto(url, this.gotoOpts), nonHTMLLoad]);
await page.goto(url, gotoOpts);
this.errorCount = 0;
} catch (e) {
let msg = e.message || "";
if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
this.statusLog(`ERROR: ${url}: ${msg}`);
this.errorCount++;
}
}

@@ -924,7 +1006,9 @@ class Crawler {

await fsp.mkdir(crawlDir, {recursive: true});

const filename = path.join(crawlDir, `crawl-${ts}-${this.params.crawlId}.yaml`);
const filenameOnly = `crawl-${ts}-${this.params.crawlId}.yaml`;

const filename = path.join(crawlDir, filenameOnly);

const state = await this.crawlState.serialize();

@@ -951,6 +1035,12 @@ class Crawler {

console.error(`Failed to delete old save state file: ${oldFilename}`);
}
}

if (this.storage && done && this.params.saveState === "always") {
const targetFilename = interpolateFilename(filenameOnly, this.crawlId);

await this.storage.uploadFile(filename, targetFilename);
}
}
}
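The crawler.js changes above swap several `child_process.spawnSync` calls for `child_process.spawn` wrapped in the new `awaitProcess` helper, which resolves with the child's numeric exit code (so the checks become `result !== 0` instead of `result.status !== 0`). A minimal standalone sketch of that pattern, using a placeholder command rather than the real wacz / wb-manager invocations:

const child_process = require("child_process");

// resolves with the numeric exit code once the child process closes
function awaitProcess(proc) {
  return new Promise((resolve) => {
    proc.on("close", (code) => resolve(code));
  });
}

async function main() {
  // "echo" stands in here for the wacz / wb-manager commands used in crawler.js
  const code = await awaitProcess(child_process.spawn("echo", ["hello"], {stdio: "inherit"}));
  if (code !== 0) {
    throw new Error(`command failed with exit code ${code}`);
  }
}

main();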
@@ -210,7 +210,7 @@ async function createProfile(params, browser, page, targetFilename = "") {

let resource = {};

const storage = initStorage("profiles/");
const storage = initStorage();
if (storage) {
console.log("Uploading to remote storage...");
resource = await storage.uploadFile(profileFilename, targetFilename);

@@ -269,6 +269,16 @@ class InteractiveBrowser {

page.on("load", () => this.addOrigin());

page.on("popup", async () => {
await this.page._client.send("Target.activateTarget", {targetId: this.targetId});
});

page._client.on("Page.windowOpen", async (resp) => {
if (resp.url) {
await page.goto(resp.url);
}
});

this.shutdownWait = params.shutdownWait * 1000;

if (this.shutdownWait) {

@@ -296,6 +306,7 @@ class InteractiveBrowser {

const parsedUrl = new URL(req.url, `http://${req.headers.host}`);
const pathname = parsedUrl.pathname;
let targetUrl;
let origins;

switch (pathname) {
case "/":

@@ -310,8 +321,12 @@ class InteractiveBrowser {

this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
console.log(`Ping received, delaying shutdown for ${this.shutdownWait}ms`);
}

origins = Array.from(this.originSet.values());

res.writeHead(200, {"Content-Type": "application/json"});
res.end(JSON.stringify({"pong": true}));
res.end(JSON.stringify({pong: true, origins}));
return;

case "/target":

@@ -319,35 +334,37 @@ class InteractiveBrowser {

res.end(JSON.stringify({targetId: this.targetId}));
return;

case "/navigate":
if (req.method !== "POST") {
break;
}

try {
const postData = await this.readBodyJson(req);
const url = new URL(postData.url).href;

res.writeHead(200, {"Content-Type": "application/json"});
res.end(JSON.stringify({success: true}));

this.page.goto(url);

} catch (e) {
res.writeHead(400, {"Content-Type": "application/json"});
res.end(JSON.stringify({"error": e.toString()}));
console.log(e);
}
return;

case "/createProfileJS":
if (req.method !== "POST") {
break;
}

try {

const buffers = [];

for await (const chunk of req) {
buffers.push(chunk);
}

const data = Buffer.concat(buffers).toString();

let targetFilename = "";

if (data.length) {
try {
targetFilename = JSON.parse(data).filename;
} catch (e) {
targetFilename = "";
}
}

console.log("target filename", targetFilename);

const postData = await this.readBodyJson(req);
const targetFilename = postData.filename || "";
const resource = await createProfile(this.params, this.browser, this.page, targetFilename);
const origins = Array.from(this.originSet.values());
origins = Array.from(this.originSet.values());

res.writeHead(200, {"Content-Type": "application/json"});
res.end(JSON.stringify({resource, origins}));

@@ -383,6 +400,24 @@ class InteractiveBrowser {

res.writeHead(404, {"Content-Type": "text/html"});
res.end("Not Found");
}

async readBodyJson(req) {
const buffers = [];

for await (const chunk of req) {
buffers.push(chunk);
}

const data = Buffer.concat(buffers).toString();

if (data.length) {
try {
return JSON.parse(data) || {};
} catch (e) {
return {};
}
}
}
}
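The profile browser's control server now exposes `/navigate` (POST, JSON body with a `url` field) and includes the collected origins in the `/ping` response, as the handler above shows. A rough client-side sketch; the host and port (localhost:9223) are assumptions for illustration, not values taken from this diff:

const http = require("http");

const base = { host: "localhost", port: 9223 }; // assumed address of the profile browser server

// ask the embedded browser to navigate to a new page
const body = JSON.stringify({ url: "https://example.com/login" });
const req = http.request(
  { ...base, path: "/navigate", method: "POST",
    headers: { "Content-Type": "application/json", "Content-Length": Buffer.byteLength(body) } },
  (res) => {
    console.log("navigate status:", res.statusCode); // 200 on success, 400 on a bad or missing url

    // then poll /ping, which now also reports the origins visited so far
    http.get({ ...base, path: "/ping" }, (pingRes) => {
      let data = "";
      pingRes.on("data", (chunk) => data += chunk);
      pingRes.on("end", () => console.log(JSON.parse(data))); // e.g. { pong: true, origins: [...] }
    });
  });
req.write(body);
req.end();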
@@ -1,6 +1,6 @@

{
"name": "browsertrix-crawler",
"version": "0.6.0-beta.0",
"version": "0.6.0-beta.1",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",

@@ -11,6 +11,7 @@

"dependencies": {
"abort-controller": "^3.0.0",
"browsertrix-behaviors": "^0.3.0",
"get-folder-size": "2",
"ioredis": "^4.27.1",
"js-yaml": "^4.1.0",
"minio": "7.0.26",
@@ -11,6 +11,7 @@ const { hideBin } = require("yargs/helpers");

const { NewWindowPage} = require("./screencaster");
const { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } = require("./constants");
const { ScopedSeed } = require("./seeds");
const { interpolateFilename } = require("./storage");

// ============================================================================

@@ -114,7 +115,7 @@ class ArgParser {

alias: "c",
describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
type: "string",
default: `capture-${new Date().toISOString().slice(0,19)}`.replace(/:/g, "-")
default: "crawl-@ts"
},

"headless": {

@@ -255,7 +256,31 @@ class ArgParser {

describe: "Number of save states to keep during the duration of a crawl",
type: "number",
default: 5,
}
},

"sizeLimit": {
describe: "If set, save state and exit if size limit exceeds this value",
type: "number",
default: 0,
},

"timeLimit": {
describe: "If set, save state and exit after time limit, in seconds",
type: "number",
default: 0,
},

"healthCheckPort": {
describe: "port to run healthcheck on",
type: "number",
default: 0,
},

"overwrite": {
describe: "overwrite current crawl data: if set, existing collection directory will be deleted before crawl is started",
type: "boolean",
default: false
},
};
}

@@ -286,6 +311,8 @@ class ArgParser {

validateArgs(argv) {
argv.collection = interpolateFilename(argv.collection, argv.crawlId);

// Check that the collection name is valid.
if (argv.collection.search(/^[\w][\w-]*$/) === -1){
throw new Error(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
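validateArgs above now runs the collection name through interpolateFilename before checking it against the `/^[\w][\w-]*$/` pattern, so the new default of "crawl-@ts" expands to a plain timestamped name that passes validation. A small sketch of that flow, reusing the same substitutions as the interpolateFilename in util/storage.js shown later in this commit (the sample crawl ID is arbitrary):

const os = require("os");

// same substitutions as interpolateFilename in util/storage.js
function interpolateFilename(filename, crawlId) {
  filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.-]/g, ""));
  filename = filename.replace("@hostname", os.hostname());
  filename = filename.replace("@hostsuffix", os.hostname().slice(-14));
  filename = filename.replace("@id", crawlId);
  return filename;
}

const collection = interpolateFilename("crawl-@ts", "example-crawl-id");
console.log(collection);                               // e.g. "crawl-20220101123456789"
console.log(collection.search(/^[\w][\w-]*$/) !== -1); // true: passes the collection-name check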
@@ -91,8 +91,9 @@ module.exports.chromeArgs = (proxy, userAgent=null, extraArgs=[]) => {

"--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
"--no-sandbox",
"--disable-background-media-suspend",
"--enable-features=NetworkService,NetworkServiceInProcess",
"--autoplay-policy=no-user-gesture-required",
"--disable-features=Translate,LazyFrameLoading,IsolateOrigins,site-per-process",
"--disable-features=IsolateOrigins,site-per-process,ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,AcceptCHFrame,AutoExpandDetailsElement",
"--disable-popup-blocking",
"--disable-backgrounding-occluded-windows",
`--user-agent=${userAgent || getDefaultUA()}`,
@@ -274,7 +274,11 @@ return 0;

data = JSON.parse(json);
} catch(e) {
console.error("Invalid queued json: ", json);
return;
return null;
}

if (!data) {
return null;
}

const url = data.url;
@@ -9,11 +9,14 @@ const Minio = require("minio");

const { initRedis } = require("./redis");

const util = require("util");
const getFolderSize = util.promisify(require("get-folder-size"));

// ===========================================================================
class S3StorageSync
{
constructor(urlOrData, {filename, webhookUrl, userId, crawlId, prefix = ""} = {}) {
constructor(urlOrData, {webhookUrl, userId, crawlId} = {}) {
let url;
let accessKey;
let secretKey;

@@ -53,23 +56,14 @@ class S3StorageSync

this.userId = userId;
this.crawlId = crawlId;
this.webhookUrl = webhookUrl;

this.filenamePrefix = prefix;

filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.]/g, ""));
filename = filename.replace("@hostname", os.hostname());
filename = filename.replace("@id", this.crawlId);

this.targetFilename = this.filenamePrefix + filename;
}

async uploadFile(srcFilename, targetFilename) {
// allow overriding targetFilename
if (targetFilename) {
targetFilename = this.filenamePrefix + targetFilename;
} else {
targetFilename = this.targetFilename;
}
console.log(`Bucket: ${this.bucketName}`);
console.log(`Crawl Id: ${this.crawlId}`);
console.log(`Prefix: ${this.objectPrefix}`);
console.log(`Target Filename: ${targetFilename}`);

await this.client.fPutObject(this.bucketName, this.objectPrefix + targetFilename, srcFilename);

const finalHash = await checksumFile("sha256", srcFilename);

@@ -82,8 +76,8 @@ class S3StorageSync

await this.client.fGetObject(this.bucketName, this.objectPrefix + srcFilename, destFilename);
}

async uploadCollWACZ(srcFilename, completed = true) {
const resource = await this.uploadFile(srcFilename, this.targetFilename);
async uploadCollWACZ(srcFilename, targetFilename, completed = true) {
const resource = await this.uploadFile(srcFilename, targetFilename);
console.log(resource);

if (this.webhookUrl) {

@@ -92,7 +86,7 @@ class S3StorageSync

user: this.userId,

//filename: `s3://${this.bucketName}/${this.objectPrefix}${this.waczFilename}`,
filename: this.fullPrefix + this.targetFilename,
filename: this.fullPrefix + targetFilename,

hash: resource.hash,
size: resource.bytes,

@@ -116,7 +110,7 @@ class S3StorageSync

}
}

function initStorage(prefix = "") {
function initStorage() {
if (!process.env.STORE_ENDPOINT_URL) {
return null;
}

@@ -132,8 +126,6 @@ function initStorage(prefix = "") {

crawlId: process.env.CRAWL_ID || os.hostname(),
webhookUrl: process.env.WEBHOOK_URL,
userId: process.env.STORE_USER,
prefix,
filename: process.env.STORE_FILENAME || "@ts-@id.wacz",
};

console.log("Initing Storage...");

@@ -146,6 +138,10 @@ async function getFileSize(filename) {

return stats.size;
}

async function getDirSize(dir) {
return await getFolderSize(dir);
}

function checksumFile(hashName, path) {
return new Promise((resolve, reject) => {
const hash = createHash(hashName);

@@ -156,7 +152,16 @@ function checksumFile(hashName, path) {

});
}

function interpolateFilename(filename, crawlId) {
filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.-]/g, ""));
filename = filename.replace("@hostname", os.hostname());
filename = filename.replace("@hostsuffix", os.hostname().slice(-14));
filename = filename.replace("@id", crawlId);
return filename;
}

module.exports.S3StorageSync = S3StorageSync;
module.exports.getFileSize = getFileSize;
module.exports.getDirSize = getDirSize;
module.exports.initStorage = initStorage;

module.exports.interpolateFilename = interpolateFilename;
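The new getDirSize wrapper (a promisified get-folder-size, added to package.json earlier in this commit) is what checkLimits in crawler.js uses to compare the archive directory against `--sizeLimit`. A small standalone sketch of that check; the directory path and limit below are placeholder values:

const util = require("util");
const getFolderSize = util.promisify(require("get-folder-size"));

async function getDirSize(dir) {
  return await getFolderSize(dir);
}

async function checkSizeLimit(dir, sizeLimit) {
  const size = await getDirSize(dir);
  if (size >= sizeLimit) {
    console.log(`Size threshold reached ${size} >= ${sizeLimit}, stopping`);
    return true; // the crawler would set its drain flag and later exit with code 11
  }
  return false;
}

// placeholder values: a local archive directory and a 1 GiB limit
checkSizeLimit("./collections/example/archive", 1024 * 1024 * 1024).then(console.log);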
yarn.lock (18 changes)

@@ -2247,6 +2247,11 @@ functional-red-black-tree@^1.0.1:

resolved "https://registry.yarnpkg.com/functional-red-black-tree/-/functional-red-black-tree-1.0.1.tgz#1b0ab3bd553b2a0d6399d29c0e3ea0b252078327"
integrity sha1-GwqzvVU7Kg1jmdKcDj6gslIHgyc=

gar@^1.0.4:
version "1.0.4"
resolved "https://registry.yarnpkg.com/gar/-/gar-1.0.4.tgz#f777bc7db425c0572fdeb52676172ca1ae9888b8"
integrity sha512-w4n9cPWyP7aHxKxYHFQMegj7WIAsL/YX/C4Bs5Rr8s1H9M1rNtRWRsw+ovYMkXDQ5S4ZbYHsHAPmevPjPgw44w==

gensync@^1.0.0-beta.2:
version "1.0.0-beta.2"
resolved "https://registry.yarnpkg.com/gensync/-/gensync-1.0.0-beta.2.tgz#32a6ee76c3d7f52d46b2b1ae5d93fea8580a25e0"

@@ -2257,6 +2262,14 @@ get-caller-file@^2.0.1, get-caller-file@^2.0.5:

resolved "https://registry.yarnpkg.com/get-caller-file/-/get-caller-file-2.0.5.tgz#4f94412a82db32f36e3b0b9741f8a97feb031f7e"
integrity sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==

get-folder-size@2:
version "2.0.1"
resolved "https://registry.yarnpkg.com/get-folder-size/-/get-folder-size-2.0.1.tgz#3fe0524dd3bad05257ef1311331417bcd020a497"
integrity sha512-+CEb+GDCM7tkOS2wdMKTn9vU7DgnKUTuDlehkNJKNSovdCOVxs14OfKCk4cvSaR3za4gj+OBdl9opPN9xrJ0zA==
dependencies:
gar "^1.0.4"
tiny-each-async "2.0.3"

get-intrinsic@^1.0.2, get-intrinsic@^1.1.0, get-intrinsic@^1.1.1:
version "1.1.1"
resolved "https://registry.yarnpkg.com/get-intrinsic/-/get-intrinsic-1.1.1.tgz#15f59f376f855c446963948f0d24cd3637b4abc6"

@@ -5002,6 +5015,11 @@ through@^2.3.8:

resolved "https://registry.yarnpkg.com/through/-/through-2.3.8.tgz#0dd4c9ffaabc357960b1b724115d7e0e86a2e1f5"
integrity sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU=

tiny-each-async@2.0.3:
version "2.0.3"
resolved "https://registry.yarnpkg.com/tiny-each-async/-/tiny-each-async-2.0.3.tgz#8ebbbfd6d6295f1370003fbb37162afe5a0a51d1"
integrity sha1-jru/1tYpXxNwAD+7NxYq/loKUdE=

tmpl@1.0.x:
version "1.0.4"
resolved "https://registry.yarnpkg.com/tmpl/-/tmpl-1.0.4.tgz#23640dd7b42d00433911140820e5cf440e521dd1"