Implement improved JSONL (JSON Lines) logging

- Add Logger class with methods for info, error, warn, debug, fatal
- Add context, timestamp, and details fields to log entries
- Log messages as JSON Lines
- Replace puppeteer-cluster stats with custom stats implementation
- Log behaviors by default
- Amend argParser to reflect logging changes
- Capture and log stdout/stderr from awaited child processes
- Modify tests to use webrecorder.net to avoid timeouts
Tessa Walsh 2022-12-15 12:38:41 -05:00
parent 2b03e23174
commit 0192d05f4c
16 changed files with 276 additions and 194 deletions
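
For reference, each line the new Logger writes is one self-contained JSON object; a representative entry (timestamp and values here are illustrative) looks like:

{"logLevel":"info","timestamp":"2022-12-15T17:38:41.000Z","context":"state","message":"Storing state in memory","details":{}}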

crawler.js

@ -23,6 +23,7 @@ import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screenca
import { Screenshots } from "./util/screenshots.js"; import { Screenshots } from "./util/screenshots.js";
import { parseArgs } from "./util/argParser.js"; import { parseArgs } from "./util/argParser.js";
import { initRedis } from "./util/redis.js"; import { initRedis } from "./util/redis.js";
import { Logger } from "./util/logger.js";
import { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI } from "./util/browser.js"; import { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI } from "./util/browser.js";
@ -46,6 +47,13 @@ const behaviors = fs.readFileSync(new URL("./node_modules/browsertrix-behaviors/
// ============================================================================ // ============================================================================
export class Crawler { export class Crawler {
constructor() { constructor() {
const res = parseArgs();
this.params = res.parsed;
this.origConfig = res.origConfig;
const debugLogging = this.params.logging.includes("debug");
this.logger = new Logger(debugLogging);
this.headers = {}; this.headers = {};
this.crawlState = null; this.crawlState = null;
@ -63,23 +71,17 @@ export class Crawler {
this.userAgent = ""; this.userAgent = "";
const res = parseArgs();
this.params = res.parsed;
this.origConfig = res.origConfig;
this.saveStateFiles = []; this.saveStateFiles = [];
this.lastSaveTime = 0; this.lastSaveTime = 0;
this.saveStateInterval = this.params.saveStateInterval * 1000; this.saveStateInterval = this.params.saveStateInterval * 1000;
this.debugLogging = this.params.logging.includes("debug");
if (this.params.profile) { if (this.params.profile) {
this.statusLog("With Browser Profile: " + this.params.profile); this.logger.info(`With Browser Profile: ${this.params.profile}"`);
} }
this.emulateDevice = this.params.emulateDevice; this.emulateDevice = this.params.emulateDevice;
this.debugLog("Seeds", this.params.scopedSeeds); this.logger.info("Seeds", this.params.scopedSeeds);
this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`; this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
this.capturePrefix = process.env.NO_PROXY ? "" : this.captureBasePrefix + "/id_/"; this.capturePrefix = process.env.NO_PROXY ? "" : this.captureBasePrefix + "/id_/";
@ -110,18 +112,6 @@ export class Crawler {
this.done = false; this.done = false;
this.behaviorLastLine = null; this.behaviorLastLine = null;
this.logConsole = false;
}
statusLog(...args) {
console.log(...args);
}
debugLog(...args) {
if (this.debugLogging) {
console.log(...args);
}
} }
configureUA() { configureUA() {
@ -158,7 +148,7 @@ export class Crawler {
if (redisUrl) { if (redisUrl) {
if (!redisUrl.startsWith("redis://")) { if (!redisUrl.startsWith("redis://")) {
throw new Error("stateStoreUrl must start with redis:// -- Only redis-based store currently supported"); this.logger.fatal("stateStoreUrl must start with redis:// -- Only redis-based store currently supported");
} }
let redis; let redis;
@ -168,24 +158,24 @@ export class Crawler {
redis = await initRedis(redisUrl); redis = await initRedis(redisUrl);
break; break;
} catch (e) { } catch (e) {
//throw new Error("Unable to connect to state store Redis: " + redisUrl); //this.logger.fatal("Unable to connect to state store Redis: " + redisUrl);
console.warn(`Waiting for redis at ${redisUrl}`); this.logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state");
await this.sleep(3); await this.sleep(3);
} }
} }
this.statusLog(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`); this.logger.info(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state");
this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.timeout * 2, os.hostname()); this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.timeout * 2, os.hostname());
} else { } else {
this.statusLog("Storing state in memory"); this.logger.info("Storing state in memory", {}, "state");
this.crawlState = new MemoryCrawlState(); this.crawlState = new MemoryCrawlState();
} }
if (this.params.saveState === "always" && this.params.saveStateInterval) { if (this.params.saveState === "always" && this.params.saveStateInterval) {
this.statusLog(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`); this.logger.info(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`, {}, "state");
} }
return this.crawlState; return this.crawlState;
@ -196,10 +186,10 @@ export class Crawler {
if (this.params.screencastPort) { if (this.params.screencastPort) {
transport = new WSTransport(this.params.screencastPort); transport = new WSTransport(this.params.screencastPort);
this.debugLog(`Screencast server started on: ${this.params.screencastPort}`); this.logger.debug(`Screencast server started on: ${this.params.screencastPort}`, {}, "screencast");
} else if (this.params.redisStoreUrl && this.params.screencastRedis) { } else if (this.params.redisStoreUrl && this.params.screencastRedis) {
transport = new RedisPubSubTransport(this.params.redisStoreUrl, this.crawlId); transport = new RedisPubSubTransport(this.params.redisStoreUrl, this.crawlId);
this.debugLog("Screencast enabled via redis pubsub"); this.logger.debug("Screencast enabled via redis pubsub", {}, "screencast");
} }
if (!transport) { if (!transport) {
@ -215,7 +205,7 @@ export class Crawler {
const initRes = child_process.spawnSync("wb-manager", ["init", this.params.collection], {cwd: this.params.cwd}); const initRes = child_process.spawnSync("wb-manager", ["init", this.params.collection], {cwd: this.params.cwd});
if (initRes.status) { if (initRes.status) {
console.log("wb-manager init failed, collection likely already exists"); this.logger.info("wb-manager init failed, collection likely already exists");
} }
await fsp.mkdir(logs, {recursive: true}); await fsp.mkdir(logs, {recursive: true});
@ -236,8 +226,6 @@ export class Crawler {
redisStdio = "ignore"; redisStdio = "ignore";
} }
this.logConsole = this.params.logging.includes("jserrors");
this.browserExe = getBrowserExe(); this.browserExe = getBrowserExe();
this.configureUA(); this.configureUA();
@ -249,11 +237,11 @@ export class Crawler {
subprocesses.push(child_process.spawn("redis-server", {cwd: "/tmp/", stdio: redisStdio})); subprocesses.push(child_process.spawn("redis-server", {cwd: "/tmp/", stdio: redisStdio}));
if (this.params.overwrite) { if (this.params.overwrite) {
console.log(`Clearing ${this.collDir} before starting`); this.logger.info(`Clearing ${this.collDir} before starting`);
try { try {
fs.rmSync(this.collDir, { recursive: true, force: true }); fs.rmSync(this.collDir, { recursive: true, force: true });
} catch(e) { } catch(e) {
console.warn(e); this.logger.error(`Unable to clear ${this.collDir}`, e);
} }
} }
@ -318,8 +306,7 @@ export class Crawler {
await this.crawl(); await this.crawl();
status = (!this.interrupted ? "done" : "interrupted"); status = (!this.interrupted ? "done" : "interrupted");
} catch(e) { } catch(e) {
console.error("Crawl failed"); this.logger.error("Crawl failed", e);
console.error(e);
exitCode = 9; exitCode = 9;
status = "failing"; status = "failing";
if (await this.crawlState.incFailCount()) { if (await this.crawlState.incFailCount()) {
@ -327,7 +314,7 @@ export class Crawler {
} }
} finally { } finally {
console.log(status); this.logger.info(`Crawl status: ${status}`);
if (this.crawlState) { if (this.crawlState) {
await this.crawlState.setStatus(status); await this.crawlState.setStatus(status);
@ -344,16 +331,14 @@ export class Crawler {
case "info": case "info":
behaviorLine = JSON.stringify(data); behaviorLine = JSON.stringify(data);
if (behaviorLine != this._behaviorLastLine) { if (behaviorLine != this._behaviorLastLine) {
console.log(behaviorLine); this.logger.info("Behavior line", data, "behavior");
this._behaviorLastLine = behaviorLine; this._behaviorLastLine = behaviorLine;
} }
break; break;
case "debug": case "debug":
default: default:
if (this.params.behaviorsLogDebug) { this.logger.debug("Behavior debug", data, "behavior");
console.log("behavior debug: " + JSON.stringify(data));
}
} }
} }
@ -367,7 +352,7 @@ export class Crawler {
const {page, data} = opts; const {page, data} = opts;
if (!this.isInScope(data)) { if (!this.isInScope(data)) {
console.log(`No longer in scope: ${JSON.stringify(data)}`); this.logger.info("Page no longer in scope", data);
return; return;
} }
@ -425,7 +410,7 @@ export class Crawler {
if (this.params.behaviorOpts) { if (this.params.behaviorOpts) {
if (!page.isHTMLPage) { if (!page.isHTMLPage) {
console.log("Skipping behaviors for non-HTML page"); this.logger.info("Skipping behaviors for non-HTML page", data, "behavior");
} else { } else {
await Promise.allSettled(page.frames().map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();"))); await Promise.allSettled(page.frames().map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();")));
@ -434,6 +419,8 @@ export class Crawler {
} }
} }
this.logger.info("Page graph data for successfully crawled page", data, "pageGraph");
await this.writeStats(); await this.writeStats();
await this.checkLimits(); await this.checkLimits();
@ -441,7 +428,7 @@ export class Crawler {
await this.serializeConfig(); await this.serializeConfig();
} catch (e) { } catch (e) {
console.warn(e); this.logger.error(`Error crawling page ${data.url}`, e.message);
await this.markPageFailed(page); await this.markPageFailed(page);
} }
} }
@ -470,14 +457,14 @@ export class Crawler {
switch (pathname) { switch (pathname) {
case "/healthz": case "/healthz":
if (this.errorCount < threshold) { if (this.errorCount < threshold) {
console.log(`health check ok, num errors ${this.errorCount} < ${threshold}`); this.logger.debug(`health check ok, num errors ${this.errorCount} < ${threshold}`);
res.writeHead(200); res.writeHead(200);
res.end(); res.end();
} }
return; return;
} }
console.log(`health check failed: ${this.errorCount} >= ${threshold}`); this.logger.error(`health check failed: ${this.errorCount} >= ${threshold}`);
res.writeHead(503); res.writeHead(503);
res.end(); res.end();
} }
@ -491,7 +478,7 @@ export class Crawler {
const size = await getDirSize(dir); const size = await getDirSize(dir);
if (size >= this.params.sizeLimit) { if (size >= this.params.sizeLimit) {
console.log(`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`); this.logger.info(`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`);
interrupt = true; interrupt = true;
this.clearOnExit = true; this.clearOnExit = true;
} }
@ -500,7 +487,7 @@ export class Crawler {
if (this.params.timeLimit) { if (this.params.timeLimit) {
const elapsed = (Date.now() - this.startTime) / 1000; const elapsed = (Date.now() - this.startTime) / 1000;
if (elapsed > this.params.timeLimit) { if (elapsed > this.params.timeLimit) {
console.log(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`); this.logger.info(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
interrupt = true; interrupt = true;
} }
} }
@ -522,9 +509,9 @@ export class Crawler {
if (!markDone) { if (!markDone) {
this.params.waitOnDone = false; this.params.waitOnDone = false;
this.clearOnExit = true; this.clearOnExit = true;
console.log("SIGNAL: Preparing for exit of this crawler instance only"); this.logger.info("SIGNAL: Preparing for exit of this crawler instance only");
} else { } else {
console.log("SIGNAL: Preparing for final exit of all crawlers"); this.logger.info("SIGNAL: Preparing for final exit of all crawlers");
this.finalExit = true; this.finalExit = true;
} }
} }
@ -539,7 +526,7 @@ export class Crawler {
if (this.params.healthCheckPort) { if (this.params.healthCheckPort) {
this.healthServer = http.createServer((...args) => this.healthCheck(...args)); this.healthServer = http.createServer((...args) => this.healthCheck(...args));
this.statusLog(`Healthcheck server started on ${this.params.healthCheckPort}`); this.logger.info(`Healthcheck server started on ${this.params.healthCheckPort}`);
this.healthServer.listen(this.params.healthCheckPort); this.healthServer.listen(this.params.healthCheckPort);
} }
@ -547,7 +534,7 @@ export class Crawler {
const driverUrl = new URL(this.params.driver, import.meta.url); const driverUrl = new URL(this.params.driver, import.meta.url);
this.driver = (await import(driverUrl)).default; this.driver = (await import(driverUrl)).default;
} catch(e) { } catch(e) {
console.warn(e); this.logger.warn(`Error importing driver ${this.params.driver}`, e);
return; return;
} }
@ -556,7 +543,7 @@ export class Crawler {
let initState = await this.crawlState.getStatus(); let initState = await this.crawlState.getStatus();
while (initState === "debug") { while (initState === "debug") {
console.log("Paused for debugging, will continue after manual resume"); this.logger.info("Paused for debugging, will continue after manual resume");
await this.sleep(60); await this.sleep(60);
@ -568,7 +555,7 @@ export class Crawler {
this.done = true; this.done = true;
if (this.params.waitOnDone) { if (this.params.waitOnDone) {
this.statusLog("Already done, waiting for signal to exit..."); this.logger.info("Already done, waiting for signal to exit...");
// wait forever until signal // wait forever until signal
await new Promise(() => {}); await new Promise(() => {});
@ -589,7 +576,7 @@ export class Crawler {
timeout: this.params.timeout * 2, timeout: this.params.timeout * 2,
puppeteerOptions: this.puppeteerArgs, puppeteerOptions: this.puppeteerArgs,
puppeteer, puppeteer,
monitor: this.params.logging.includes("stats") monitor: false
}); });
@ -606,11 +593,11 @@ export class Crawler {
await this.initPages(); await this.initPages();
if (this.params.blockAds) { if (this.params.blockAds) {
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, (text) => this.debugLog(text)); this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, this.logger);
} }
if (this.params.blockRules && this.params.blockRules.length) { if (this.params.blockRules && this.params.blockRules.length) {
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text)); this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, this.logger);
} }
this.screencaster = this.initScreenCaster(); this.screencaster = this.initScreenCaster();
@ -637,8 +624,6 @@ export class Crawler {
await this.serializeConfig(true); await this.serializeConfig(true);
this.writeStats();
if (this.pagesFH) { if (this.pagesFH) {
await this.pagesFH.sync(); await this.pagesFH.sync();
await this.pagesFH.close(); await this.pagesFH.close();
@ -652,27 +637,28 @@ export class Crawler {
} }
if (this.params.generateCDX) { if (this.params.generateCDX) {
this.statusLog("Generating CDX"); this.logger.info("Generating CDX");
await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd}));
} }
await this.writeStats(true);
if (this.params.generateWACZ && (!this.interrupted || this.finalExit || this.clearOnExit)) { if (this.params.generateWACZ && (!this.interrupted || this.finalExit || this.clearOnExit)) {
await this.generateWACZ(); await this.generateWACZ();
if (this.clearOnExit) { if (this.clearOnExit) {
console.log(`Clearing ${this.collDir} before exit`); this.logger.info(`Clearing ${this.collDir} before exit`);
try { try {
fs.rmSync(this.collDir, { recursive: true, force: true }); fs.rmSync(this.collDir, { recursive: true, force: true });
} catch(e) { } catch(e) {
console.warn(e); this.logger.warn(`Unable to clear ${this.collDir} before exit`, e);
} }
} }
} }
if (this.params.waitOnDone && (!this.interrupted || this.finalExit)) { if (this.params.waitOnDone && (!this.interrupted || this.finalExit)) {
this.done = true; this.done = true;
this.statusLog("All done, waiting for signal..."); this.logger.info("All done, waiting for signal...");
await this.crawlState.setStatus("done"); await this.crawlState.setStatus("done");
// wait forever until signal // wait forever until signal
@ -681,7 +667,7 @@ export class Crawler {
} }
async generateWACZ() { async generateWACZ() {
this.statusLog("Generating WACZ"); this.logger.info("Generating WACZ");
const archiveDir = path.join(this.collDir, "archive"); const archiveDir = path.join(this.collDir, "archive");
@ -691,13 +677,13 @@ export class Crawler {
// is finished (>0 pages and all pages written) // is finished (>0 pages and all pages written)
const isFinished = await this.crawlState.isFinished(); const isFinished = await this.crawlState.isFinished();
console.log(`Num WARC Files: ${warcFileList.length}`); this.logger.info(`Num WARC Files: ${warcFileList.length}`);
if (!warcFileList.length) { if (!warcFileList.length) {
// if finished, just return // if finished, just return
if (isFinished) { if (isFinished) {
return; return;
} }
throw new Error("No WARC Files, assuming crawl failed"); this.logger.fatal("No WARC Files, assuming crawl failed");
} }
// Build the argument list to pass to the wacz create command // Build the argument list to pass to the wacz create command
@ -720,14 +706,14 @@ export class Crawler {
warcFileList.forEach((val, index) => createArgs.push(path.join(archiveDir, val))); // eslint-disable-line no-unused-vars warcFileList.forEach((val, index) => createArgs.push(path.join(archiveDir, val))); // eslint-disable-line no-unused-vars
// create WACZ // create WACZ
const waczResult = await this.awaitProcess(child_process.spawn("wacz" , createArgs, {stdio: "inherit"})); const waczResult = await this.awaitProcess(child_process.spawn("wacz" , createArgs));
if (waczResult !== 0) { if (waczResult !== 0) {
console.log("create result", waczResult); this.logger.error("Error creating WACZ", {"status code": waczResult});
throw new Error("Unable to write WACZ successfully"); this.logger.fatal("Unable to write WACZ successfully");
} }
this.debugLog(`WACZ successfully generated and saved to: ${waczPath}`); this.logger.debug(`WACZ successfully generated and saved to: ${waczPath}`);
// Verify WACZ // Verify WACZ
/* /*
@ -735,11 +721,11 @@ export class Crawler {
validateArgs.push("-f"); validateArgs.push("-f");
validateArgs.push(waczPath); validateArgs.push(waczPath);
const waczVerifyResult = await this.awaitProcess(child_process.spawn("wacz", validateArgs, {stdio: "inherit"})); const waczVerifyResult = await this.awaitProcess(child_process.spawn("wacz", validateArgs));
if (waczVerifyResult !== 0) { if (waczVerifyResult !== 0) {
console.log("validate", waczVerifyResult); console.log("validate", waczVerifyResult);
throw new Error("Unable to verify WACZ created successfully"); this.logger.fatal("Unable to verify WACZ created successfully");
} }
*/ */
if (this.storage) { if (this.storage) {
@ -751,23 +737,43 @@ export class Crawler {
} }
awaitProcess(proc) { awaitProcess(proc) {
proc.stdout.on("data", (data) => {
this.logger.info(data.toString());
});
proc.stderr.on("data", (data) => {
this.logger.error(data.toString());
});
return new Promise((resolve) => { return new Promise((resolve) => {
proc.on("close", (code) => resolve(code)); proc.on("close", (code) => resolve(code));
}); });
} }
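
The stdout/stderr capture above forwards each chunk of child-process output into the JSONL stream. A minimal standalone sketch of the same pattern (the echo command is just an illustration):

import child_process from "child_process";
import { Logger } from "./util/logger.js";

const logger = new Logger();

function awaitProcess(proc) {
  // each stdout/stderr "data" chunk becomes its own JSONL entry;
  // chunks are arbitrary, so one entry may hold several output lines
  proc.stdout.on("data", (data) => logger.info(data.toString()));
  proc.stderr.on("data", (data) => logger.error(data.toString()));
  return new Promise((resolve) => proc.on("close", (code) => resolve(code)));
}

const exitCode = await awaitProcess(child_process.spawn("echo", ["hello"]));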
async writeStats() { async writeStats(toFile=false) {
if (this.params.statsFilename) { if (!this.params.logging.includes("stats")) {
const total = this.cluster.allTargetCount; return;
const workersRunning = this.cluster.workersBusy.length; }
const numCrawled = total - (await this.crawlState.size()) - workersRunning;
const limit = {max: this.params.limit || 0, hit: this.limitHit};
const stats = {numCrawled, workersRunning, total, limit};
const realSize = await this.crawlState.realSize();
const pending = await this.crawlState.numRealPending();
const done = await this.crawlState.numDone();
const total = realSize + pending + done;
const limit = {max: this.params.limit || 0, hit: this.limitHit};
const stats = {
"crawled": done,
"total": total,
"pending": pending,
"limit": limit,
"pendingPages": Array.from(this.crawlState.pending.values()).map(x => JSON.stringify(x))
};
this.logger.info("Crawl statistics", stats, "crawlState");
if (toFile && this.params.statsFilename) {
try { try {
await fsp.writeFile(this.params.statsFilename, JSON.stringify(stats, null, 2)); await fsp.writeFile(this.params.statsFilename, JSON.stringify(stats, null, 2));
} catch (err) { } catch (err) {
console.warn("Stats output failed", err); this.logger.warn("Stats output failed", err);
} }
} }
} }
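
With stats logging enabled (the default), writeStats emits an entry like the following after each crawled page (counts illustrative):

{"logLevel":"info","timestamp":"2022-12-15T17:38:41.000Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":2,"total":4,"pending":1,"limit":{"max":0,"hit":false},"pendingPages":[]}}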
@ -807,26 +813,24 @@ export class Crawler {
// more serious page error, mark page session as invalid // more serious page error, mark page session as invalid
page.on("error", () => this.markPageFailed(page)); page.on("error", () => this.markPageFailed(page));
if (this.logConsole) { page.on("console", (msg) => {
page.on("console", (msg) => { if (this.params.logging.includes("jserrors") && (msg.type() === "error")) {
if (msg.type() === "error") { this.logger.warn(msg.text(), {"location": msg.location()}, "jsError");
console.log(msg.text(), msg.location()); }
} });
});
}
const gotoOpts = isHTMLPage ? this.gotoOpts : "domcontentloaded"; const gotoOpts = isHTMLPage ? this.gotoOpts : "domcontentloaded";
try { try {
await page.goto(url, gotoOpts); await page.goto(url, gotoOpts);
if (this.errorCount > 0) { if (this.errorCount > 0) {
this.statusLog(`Page loaded, resetting error count ${this.errorCount} to 0`); this.logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`);
this.errorCount = 0; this.errorCount = 0;
} }
} catch (e) { } catch (e) {
let msg = e.message || ""; let msg = e.message || "";
if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) { if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
this.statusLog(`Load Error: ${url}: ${msg}`); this.logger.error(`Load Error: ${url}: ${msg}`);
this.errorCount++; this.errorCount++;
} }
} }
@ -875,7 +879,7 @@ export class Crawler {
try { try {
await page.waitForNetworkIdle({timeout: this.params.netIdleWait * 1000}); await page.waitForNetworkIdle({timeout: this.params.netIdleWait * 1000});
} catch (e) { } catch (e) {
console.log("note: waitForNetworkIdle timed out, ignoring"); this.logger.info("note: waitForNetworkIdle timed out, ignoring");
// ignore, continue // ignore, continue
} }
} }
@ -906,7 +910,7 @@ export class Crawler {
} }
} catch (e) { } catch (e) {
console.warn("Link Extraction failed", e); this.logger.warn("Link Extraction failed", e);
} }
return results; return results;
} }
@ -932,19 +936,18 @@ export class Crawler {
} }
} }
} catch (e) { } catch (e) {
console.error("Queuing Error: ", e); this.logger.error("Queuing Error", e);
} }
} }
async checkCF(page) { async checkCF(page) {
try { try {
while (await page.$("div.cf-browser-verification.cf-im-under-attack")) { while (await page.$("div.cf-browser-verification.cf-im-under-attack")) {
this.statusLog("Cloudflare Check Detected, waiting for reload..."); this.logger.info("Cloudflare Check Detected, waiting for reload...");
await this.sleep(5.5); await this.sleep(5.5);
} }
} catch (e) { } catch (e) {
//console.warn("Check CF failed, ignoring"); this.logger.warn("Check CF failed, ignoring", e.message);
//console.warn(e);
} }
} }
@ -987,17 +990,17 @@ export class Crawler {
const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}; const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
if (this.params.text) { if (this.params.text) {
header["hasText"] = true; header["hasText"] = true;
this.statusLog("Text Extraction: Enabled"); this.logger.info("Text Extraction: Enabled");
} else { } else {
header["hasText"] = false; header["hasText"] = false;
this.statusLog("Text Extraction: Disabled"); this.logger.info("Text Extraction: Disabled");
} }
const header_formatted = JSON.stringify(header).concat("\n"); const header_formatted = JSON.stringify(header).concat("\n");
await this.pagesFH.writeFile(header_formatted); await this.pagesFH.writeFile(header_formatted);
} }
} catch(err) { } catch(err) {
console.error("pages/pages.jsonl creation failed", err); this.logger.error("pages/pages.jsonl creation failed", err);
} }
} }
@ -1017,7 +1020,7 @@ export class Crawler {
try { try {
await this.pagesFH.writeFile(processedRow); await this.pagesFH.writeFile(processedRow);
} catch (err) { } catch (err) {
console.warn("pages/pages.jsonl append failed", err); this.logger.warn("pages/pages.jsonl append failed", err);
} }
} }
@ -1033,7 +1036,7 @@ export class Crawler {
agent: this.resolveAgent agent: this.resolveAgent
}); });
if (resp.status !== 200) { if (resp.status !== 200) {
this.debugLog(`Skipping HEAD check ${url}, invalid status ${resp.status}`); this.logger.debug(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
return true; return true;
} }
@ -1067,7 +1070,7 @@ export class Crawler {
} }
async awaitPendingClear() { async awaitPendingClear() {
this.statusLog("Waiting to ensure pending data is written to WARCs..."); this.logger.info("Waiting to ensure pending data is written to WARCs...");
const redis = await initRedis("redis://localhost/0"); const redis = await initRedis("redis://localhost/0");
@ -1080,7 +1083,7 @@ export class Crawler {
if (count <= 0) { if (count <= 0) {
break; break;
} }
this.debugLog(`Still waiting for ${count} pending requests to finish...`); this.logger.debug(`Still waiting for ${count} pending requests to finish...`);
} catch (e) { } catch (e) {
break; break;
} }
@ -1104,17 +1107,17 @@ export class Crawler {
const { sites } = await sitemapper.fetch(); const { sites } = await sitemapper.fetch();
await this.queueInScopeUrls(seedId, sites, 0); await this.queueInScopeUrls(seedId, sites, 0);
} catch(e) { } catch(e) {
console.warn(e); this.logger.warn("Error fetching sites from sitemap", e);
} }
} }
async combineWARC() { async combineWARC() {
this.statusLog("Generating Combined WARCs"); this.logger.info("Generating Combined WARCs");
// Get the list of created Warcs // Get the list of created Warcs
const warcLists = await fsp.readdir(path.join(this.collDir, "archive")); const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));
this.debugLog(`Combining ${warcLists.length} WARCs...`); this.logger.debug(`Combining ${warcLists.length} WARCs...`);
const fileSizeObjects = []; // Used to sort the created warc by fileSize const fileSizeObjects = []; // Used to sort the created warc by fileSize
@ -1181,7 +1184,7 @@ export class Crawler {
fh.write(warcBuffer); fh.write(warcBuffer);
} }
this.debugLog(`Appending WARC ${fileSizeObjects[j].fileName}`); this.logger.debug(`Appending WARC ${fileSizeObjects[j].fileName}`);
const reader = fs.createReadStream(fileSizeObjects[j].fileName); const reader = fs.createReadStream(fileSizeObjects[j].fileName);
@ -1198,7 +1201,7 @@ export class Crawler {
await fh.end(); await fh.end();
} }
this.debugLog(`Combined WARCs saved as: ${generatedCombinedWarcs}`); this.logger.debug(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
} }
async serializeConfig(done = false) { async serializeConfig(done = false) {
@ -1248,10 +1251,10 @@ export class Crawler {
} }
const res = yaml.dump(this.origConfig, {lineWidth: -1}); const res = yaml.dump(this.origConfig, {lineWidth: -1});
try { try {
this.statusLog("Saving crawl state to: " + filename); this.logger.info(`Saving crawl state to: ${filename}`);
await fsp.writeFile(filename, res); await fsp.writeFile(filename, res);
} catch (e) { } catch (e) {
console.error(`Failed to write save state file: ${filename}`, e); this.logger.error(`Failed to write save state file: ${filename}`, e);
return; return;
} }
@ -1259,11 +1262,11 @@ export class Crawler {
if (this.saveStateFiles.length > this.params.saveStateHistory) { if (this.saveStateFiles.length > this.params.saveStateHistory) {
const oldFilename = this.saveStateFiles.shift(); const oldFilename = this.saveStateFiles.shift();
this.statusLog(`Removing old save-state: ${oldFilename}`); this.logger.info(`Removing old save-state: ${oldFilename}`);
try { try {
await fsp.unlink(oldFilename); await fsp.unlink(oldFilename);
} catch (e) { } catch (e) {
console.error(`Failed to delete old save state file: ${oldFilename}`); this.logger.error(`Failed to delete old save state file: ${oldFilename}`);
} }
} }

main.js

@ -1,6 +1,9 @@
#!/usr/bin/env -S node --experimental-global-webcrypto #!/usr/bin/env -S node --experimental-global-webcrypto
import { Crawler } from "./crawler.js"; import { Crawler } from "./crawler.js";
import { Logger } from "./util/logger.js";
const logger = new Logger();
var crawler = null; var crawler = null;
@ -9,29 +12,29 @@ let forceTerm = false;
async function handleTerminate(signame) { async function handleTerminate(signame) {
console.log(`${signame} received...`); logger.info(`${signame} received...`);
if (!crawler || !crawler.crawlState) { if (!crawler || !crawler.crawlState) {
console.log("error: no crawler running, exiting"); logger.error("error: no crawler running, exiting");
process.exit(1); process.exit(1);
} }
if (crawler.done) { if (crawler.done) {
console.log("success: crawler done, exiting"); logger.info("success: crawler done, exiting");
process.exit(0); process.exit(0);
} }
try { try {
if (!crawler.crawlState.drainMax) { if (!crawler.crawlState.drainMax) {
console.log("SIGNAL: gracefully finishing current pages..."); logger.info("SIGNAL: gracefully finishing current pages...");
crawler.gracefulFinish(); crawler.gracefulFinish();
} else if (forceTerm || (Date.now() - lastSigInt) > 200) { } else if (forceTerm || (Date.now() - lastSigInt) > 200) {
console.log("SIGNAL: stopping crawl now..."); logger.info("SIGNAL: stopping crawl now...");
await crawler.serializeAndExit(); await crawler.serializeAndExit();
} }
lastSigInt = Date.now(); lastSigInt = Date.now();
} catch (e) { } catch (e) {
console.log(e); logger.error("Error stopping crawl after receiving termination signal", e);
} }
} }
@ -40,7 +43,7 @@ process.on("SIGINT", () => handleTerminate("SIGINT"));
process.on("SIGTERM", () => handleTerminate("SIGTERM")); process.on("SIGTERM", () => handleTerminate("SIGTERM"));
process.on("SIGABRT", async () => { process.on("SIGABRT", async () => {
console.log("SIGABRT received, will force immediate exit on SIGTERM/SIGINT"); logger.info("SIGABRT received, will force immediate exit on SIGTERM/SIGINT");
forceTerm = true; forceTerm = true;
}); });


@ -45,7 +45,7 @@ test("check yaml config file with seed list is used", async () => {
test("check yaml config file will be overwritten by command line", async () => { test("check yaml config file will be overwritten by command line", async () => {
try{ try{
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://www.example.com --timeout 20000"); await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://specs.webrecorder.net/ --scopeType page --timeout 20000");
} }
catch (error) { catch (error) {
console.log(error); console.log(error);
@ -61,7 +61,7 @@ test("check yaml config file will be overwritten by command line", async () => {
} }
} }
expect(pages.has("https://www.example.com/")).toBe(true); expect(pages.has("https://specs.webrecorder.net/")).toBe(true);
expect(pages.size).toBe(1); expect(pages.size).toBe(1);
}); });


@ -8,7 +8,7 @@ const exec = util.promisify(execCallback);
test("check that URLs are crawled 2 extra hops beyond depth", async () => { test("check that URLs are crawled 2 extra hops beyond depth", async () => {
try { try {
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7"); await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://webrecorder.net/ --limit 7");
} }
catch (error) { catch (error) {
console.log(error); console.log(error);
@ -17,13 +17,13 @@ test("check that URLs are crawled 2 extra hops beyond depth", async () => {
const crawled_pages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8"); const crawled_pages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8");
const expectedPages = [ const expectedPages = [
"https://example.com/", "https://webrecorder.net/",
"https://www.iana.org/domains/example", "https://webrecorder.net/blog",
"http://www.iana.org/", "https://webrecorder.net/tools",
"http://www.iana.org/domains", "https://webrecorder.net/community",
"http://www.iana.org/protocols", "https://webrecorder.net/about",
"http://www.iana.org/numbers", "https://webrecorder.net/contact",
"http://www.iana.org/about", "https://webrecorder.net/faq",
]; ];
for (const page of crawled_pages.trim().split("\n")) { for (const page of crawled_pages.trim().split("\n")) {


@ -1,8 +1,8 @@
name: crawl-test-1 name: crawl-test-1
collection: configtest collection: configtest
seeds: seeds:
- https://www.example.org - https://webrecorder.net/
- https://www.iana.org/ - https://specs.webrecorder.net/
generateWACZ: true generateWACZ: true

tests/fixtures/crawl-1.yaml

@ -1,2 +1,2 @@
https://www.example.org https://webrecorder.net/about/
https://www.example.com https://specs.webrecorder.net/wacz/1.1.1/

tests/fixtures/urlSeedFile.txt

@ -4,10 +4,10 @@ import fs from "fs";
const exec = util.promisify(execCallback); const exec = util.promisify(execCallback);
test("check that URLs one-depth out from the seed-list are crawled", async () => { test("check that URLs in seed-list are crawled", async () => {
try { try {
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000"); await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000");
} }
catch (error) { catch (error) {
console.log(error); console.log(error);

util/argParser.js

@ -13,6 +13,9 @@ import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } from "./constants.js";
import { ScopedSeed } from "./seeds.js"; import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js"; import { interpolateFilename } from "./storage.js";
import { screenshotTypes } from "./screenshots.js"; import { screenshotTypes } from "./screenshots.js";
import { Logger } from "./logger.js";
const logger = new Logger();
// ============================================================================ // ============================================================================
@ -171,7 +174,7 @@ class ArgParser {
}, },
"logging": { "logging": {
describe: "Logging options for crawler, can include: stats, pywb, behaviors, behaviors-debug, jserrors", describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug",
type: "string", type: "string",
default: "stats", default: "stats",
}, },
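
For reference, --logging still takes a comma-separated list, so a run that keeps the default stats output and adds JS-error and debug logging (URL illustrative) looks like:

docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --logging stats,jserrors,debug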
@ -351,7 +354,7 @@ class ArgParser {
// Check that the collection name is valid. // Check that the collection name is valid.
if (argv.collection.search(/^[\w][\w-]*$/) === -1){ if (argv.collection.search(/^[\w][\w-]*$/) === -1){
throw new Error(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`); logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
} }
argv.timeout *= 1000; argv.timeout *= 1000;
@ -365,7 +368,7 @@ class ArgParser {
for (const opt of argv.waitUntil) { for (const opt of argv.waitUntil) {
if (!WAIT_UNTIL_OPTS.includes(opt)) { if (!WAIT_UNTIL_OPTS.includes(opt)) {
throw new Error("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(",")); logger.fatal("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));
} }
} }
@ -394,30 +397,25 @@ class ArgParser {
if (argv.behaviorTimeout) { if (argv.behaviorTimeout) {
behaviorOpts.timeout = argv.behaviorTimeout *= 1000; behaviorOpts.timeout = argv.behaviorTimeout *= 1000;
} }
if (argv.logging.includes("behaviors")) { behaviorOpts.log = BEHAVIOR_LOG_FUNC;
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
} else if (argv.logging.includes("behaviors-debug")) {
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
argv.behaviorsLogDebug = true;
}
argv.behaviorOpts = JSON.stringify(behaviorOpts); argv.behaviorOpts = JSON.stringify(behaviorOpts);
if (argv.newContext) { if (argv.newContext) {
console.log("Note: The newContext argument is deprecated in 0.8.0. Values passed to this option will be ignored"); logger.info("Note: The newContext argument is deprecated in 0.8.0. Values passed to this option will be ignored");
} }
if (argv.workers > 1) { if (argv.workers > 1) {
console.log("Window context being used to support >1 workers"); logger.info("Window context being used to support >1 workers");
argv.newContext = ReuseWindowConcurrency; argv.newContext = ReuseWindowConcurrency;
} else { } else {
console.log("Page context being used with 1 worker"); logger.info("Page context being used with 1 worker");
argv.newContext = Cluster.CONCURRENCY_PAGE; argv.newContext = Cluster.CONCURRENCY_PAGE;
} }
if (argv.mobileDevice) { if (argv.mobileDevice) {
argv.emulateDevice = puppeteer.devices[argv.mobileDevice]; argv.emulateDevice = puppeteer.devices[argv.mobileDevice];
if (!argv.emulateDevice) { if (!argv.emulateDevice) {
throw new Error("Unknown device: " + argv.mobileDevice); logger.fatal("Unknown device: " + argv.mobileDevice);
} }
} }
@ -442,13 +440,13 @@ class ArgParser {
} else { } else {
argv.netIdleWait = 2; argv.netIdleWait = 2;
} }
console.log(`Set netIdleWait to ${argv.netIdleWait} seconds`); logger.info(`Set netIdleWait to ${argv.netIdleWait} seconds`);
} }
// prefer argv.include only if string or a non-empty array // prefer argv.include only if string or a non-empty array
if (argv.include && (typeof(argv.include) === "string" || argv.include.length)) { if (argv.include && (typeof(argv.include) === "string" || argv.include.length)) {
if (argv.scopeType && argv.scopeType !== "custom") { if (argv.scopeType && argv.scopeType !== "custom") {
console.warn("You've specified a --scopeType and a --scopeIncludeRx / --include regex. The custom scope regex will take precedence, overriding the scopeType"); logger.info("You've specified a --scopeType and a --scopeIncludeRx / --include regex. The custom scope regex will take precedence, overriding the scopeType");
argv.scopeType = "custom"; argv.scopeType = "custom";
} }
} }

util/blockrules.js

@ -1,5 +1,9 @@
import fs from "fs"; import fs from "fs";
import { Logger } from "./logger.js";
const logger = new Logger();
const RULE_TYPES = ["block", "allowOnly"]; const RULE_TYPES = ["block", "allowOnly"];
const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"]; const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];
@ -28,7 +32,7 @@ class BlockRule
} }
if (!RULE_TYPES.includes(this.type)) { if (!RULE_TYPES.includes(this.type)) {
throw new Error("Rule \"type\" must be: " + RULE_TYPES.join(", ")); logger.fatal("Rule \"type\" must be: " + RULE_TYPES.join(", "));
} }
} }
@ -47,11 +51,11 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
// =========================================================================== // ===========================================================================
export class BlockRules export class BlockRules
{ {
constructor(blockRules, blockPutUrl, blockErrMsg, debugLog) { constructor(blockRules, blockPutUrl, blockErrMsg, logger) {
this.rules = []; this.rules = [];
this.blockPutUrl = blockPutUrl; this.blockPutUrl = blockPutUrl;
this.blockErrMsg = blockErrMsg; this.blockErrMsg = blockErrMsg;
this.debugLog = debugLog; this.logger = logger;
this.blockedUrlSet = new Set(); this.blockedUrlSet = new Set();
@ -60,9 +64,9 @@ export class BlockRules
} }
if (this.rules.length) { if (this.rules.length) {
this.debugLog("URL Block Rules:\n"); this.logger.debug("URL Block Rules:\n");
for (const rule of this.rules) { for (const rule of this.rules) {
this.debugLog(rule.toString()); this.logger.debug(rule.toString());
} }
} }
} }
@ -84,7 +88,7 @@ export class BlockRules
try { try {
await this.handleRequest(request); await this.handleRequest(request);
} catch (e) { } catch (e) {
console.warn(e); this.logger.warn("Error handling request", e);
} }
}); });
} }
@ -104,7 +108,7 @@ export class BlockRules
} }
} catch (e) { } catch (e) {
this.debugLog(`Block: (${blockState}) Failed On: ${url} Reason: ${e}`); this.logger.debug(`Block: (${blockState}) Failed On: ${url}`, e);
} }
} }
@ -151,10 +155,10 @@ export class BlockRules
if (block) { if (block) {
if (blockState === BlockState.BLOCK_PAGE_NAV) { if (blockState === BlockState.BLOCK_PAGE_NAV) {
console.warn(`Warning: Block rule match for page request "${url}" ignored, set --exclude to block full pages`); this.logger.warn(`Warning: Block rule match for page request "${url}" ignored, set --exclude to block full pages`);
return BlockState.ALLOW; return BlockState.ALLOW;
} }
this.debugLog(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`); this.logger.debug(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`);
await this.recordBlockMsg(url); await this.recordBlockMsg(url);
return blockState; return blockState;
} }
@ -187,7 +191,7 @@ export class BlockRules
} }
const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly; const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly;
this.debugLog(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`); this.logger.debug(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`);
return {block, done: true}; return {block, done: true};
} }
@ -204,7 +208,7 @@ export class BlockRules
return !!text.match(frameTextMatch); return !!text.match(frameTextMatch);
} catch (e) { } catch (e) {
this.debugLog(e); this.logger.debug("Error determining text match", e);
} }
} }
@ -230,8 +234,8 @@ export class BlockRules
// =========================================================================== // ===========================================================================
export class AdBlockRules extends BlockRules export class AdBlockRules extends BlockRules
{ {
constructor(blockPutUrl, blockErrMsg, debugLog, adhostsFilePath = "../ad-hosts.json") { constructor(blockPutUrl, blockErrMsg, logger, adhostsFilePath = "../ad-hosts.json") {
super([], blockPutUrl, blockErrMsg, debugLog); super([], blockPutUrl, blockErrMsg, logger);
this.adhosts = JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url))); this.adhosts = JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url)));
} }
@ -248,7 +252,7 @@ export class AdBlockRules extends BlockRules
try { try {
await this.handleRequest(request); await this.handleRequest(request);
} catch (e) { } catch (e) {
console.warn(e); this.logger.warn("Error handling request", e);
} }
}); });
} }
@ -257,7 +261,7 @@ export class AdBlockRules extends BlockRules
const fragments = url.split("/"); const fragments = url.split("/");
const domain = fragments.length > 2 ? fragments[2] : null; const domain = fragments.length > 2 ? fragments[2] : null;
if (this.adhosts.includes(domain)) { if (this.adhosts.includes(domain)) {
this.debugLog(`URL blocked for being an ad: ${url}`); this.logger.debug(`URL blocked for being an ad: ${url}`);
await this.recordBlockMsg(url); await this.recordBlockMsg(url);
return BlockState.BLOCK_AD; return BlockState.BLOCK_AD;
} }

util/browser.js

@ -4,6 +4,9 @@ import path from "path";
import os from "os"; import os from "os";
import request from "request"; import request from "request";
import { initStorage } from "./storage.js"; import { initStorage } from "./storage.js";
import { Logger } from "./logger.js";
const logger = new Logger();
const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-")); const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
@ -13,7 +16,7 @@ export async function loadProfile(profileFilename) {
if (profileFilename && if (profileFilename &&
(profileFilename.startsWith("http:") || profileFilename.startsWith("https:"))) { (profileFilename.startsWith("http:") || profileFilename.startsWith("https:"))) {
console.log(`Downloading ${profileFilename} to ${targetFilename}`); logger.info(`Downloading ${profileFilename} to ${targetFilename}`, {}, "browserProfile");
const p = new Promise((resolve, reject) => { const p = new Promise((resolve, reject) => {
request.get(profileFilename). request.get(profileFilename).
@ -29,7 +32,7 @@ export async function loadProfile(profileFilename) {
const storage = initStorage(""); const storage = initStorage("");
if (!storage) { if (!storage) {
throw new Error("Profile specified relative to s3 storage, but no S3 storage defined"); logger.fatal("Profile specified relative to s3 storage, but no S3 storage defined");
} }
await storage.downloadFile(profileFilename.slice(1), targetFilename); await storage.downloadFile(profileFilename.slice(1), targetFilename);
@ -41,7 +44,7 @@ export async function loadProfile(profileFilename) {
try { try {
child_process.execSync("tar xvfz " + profileFilename, {cwd: profileDir}); child_process.execSync("tar xvfz " + profileFilename, {cwd: profileDir});
} catch (e) { } catch (e) {
console.error(`Profile filename ${profileFilename} not a valid tar.gz`); logger.error(`Profile filename ${profileFilename} not a valid tar.gz`);
} }
} }
@ -72,7 +75,7 @@ export function getDefaultUA() {
version = child_process.execFileSync(getBrowserExe(), ["--version"], {encoding: "utf8"}); version = child_process.execFileSync(getBrowserExe(), ["--version"], {encoding: "utf8"});
version = version.match(/[\d.]+/)[0]; version = version.match(/[\d.]+/)[0];
} catch(e) { } catch(e) {
console.error(e); logger.error("Error getting default UserAgent", e);
} }
return `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`; return `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
@ -158,7 +161,7 @@ export async function evaluateWithCLI(frame, funcString) {
}); });
if (exceptionDetails) { if (exceptionDetails) {
throw new Error( logger.fatal(
"Behavior Evaluation Failed" + exceptionDetails.text "Behavior Evaluation Failed" + exceptionDetails.text
); );
} }

util/logger.js (new file)

@ -0,0 +1,45 @@
// ===========================================================================
export class Logger
{
constructor(debugLogging=false) {
this.debugLogging = debugLogging;
}
logAsJSON(message, data, context, logLevel="info") {
if (typeof data !== "object") {
data = {"message": data.toString()};
}
let dataToLog = {
"logLevel": logLevel,
"timestamp": new Date().toISOString(),
"context": context,
"message": message,
"details": data ? data : {}
};
console.log(JSON.stringify(dataToLog));
}
info(message, data={}, context="general") {
this.logAsJSON(message, data, context);
}
error(message, data={}, context="general") {
this.logAsJSON(message, data, context, "error");
}
warn(message, data={}, context="general") {
this.logAsJSON(message, data, context, "warn");
}
debug(message, data={}, context="general") {
if (this.debugLogging) {
this.logAsJSON(message, data, context, "debug");
}
}
fatal(message, data={}, context="general", exitCode=1) {
this.logAsJSON(`${message}. Quitting`, data, context, "fatal");
process.exit(exitCode);
}
}
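
A short usage sketch of the class above (message and context are illustrative):

import { Logger } from "./util/logger.js";

const logger = new Logger(true);  // enable debug-level output
logger.warn("Waiting for redis at redis://localhost/0", {}, "state");
logger.debug("Still waiting for 2 pending requests to finish...", {}, "state");
// each call prints one JSON object per line, e.g.:
// {"logLevel":"warn","timestamp":"2022-12-15T17:38:41.000Z","context":"state","message":"Waiting for redis at redis://localhost/0","details":{}}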

util/screenshots.js

@ -2,6 +2,9 @@ import fs from "fs";
import path from "path"; import path from "path";
import * as warcio from "warcio"; import * as warcio from "warcio";
import { Logger } from "./logger.js";
const logger = new Logger();
// ============================================================================ // ============================================================================
@ -43,9 +46,9 @@ export class Screenshots {
const warcRecord = await this.wrap(screenshotBuffer, screenshotType, options.type); const warcRecord = await this.wrap(screenshotBuffer, screenshotType, options.type);
const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true}); const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});
fs.appendFileSync(this.warcName, warcRecordBuffer); fs.appendFileSync(this.warcName, warcRecordBuffer);
console.log(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`); logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
} catch (e) { } catch (e) {
console.log(`Taking screenshot (type: ${screenshotType}) failed for ${this.url}`, e); logger.error(`Taking screenshot (type: ${screenshotType}) failed for ${this.url}`, e.message);
} }
} }

util/seeds.js

@ -1,7 +1,15 @@
import { Logger } from "./logger.js";
const logger = new Logger();
export class ScopedSeed export class ScopedSeed
{ {
constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) { constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
const parsedUrl = this.parseUrl(url); const parsedUrl = this.parseUrl(url);
if (!parsedUrl) {
logger.fatal(`Invalid Seed "${url}" - not a valid URL`);
}
this.url = parsedUrl.href; this.url = parsedUrl.href;
this.include = this.parseRx(include); this.include = this.parseRx(include);
this.exclude = this.parseRx(exclude); this.exclude = this.parseRx(exclude);
@ -36,11 +44,12 @@ export class ScopedSeed
try { try {
parsedUrl = new URL(url.trim()); parsedUrl = new URL(url.trim());
} catch (e) { } catch (e) {
throw new Error(`Invalid Seed "${url}" - not a valid URL`); logger.error(`Invalid Seed "${url}" - not a valid URL`);
} }
if (parsedUrl.protocol !== "http:" && parsedUrl.protocol != "https:") { if (parsedUrl.protocol !== "http:" && parsedUrl.protocol != "https:") {
throw new Error(`Invalid Seed "${url}" - URL must start with http:// or https://`); logger.error(`Invalid Seed "${url}" - URL must start with http:// or https://`);
parsedUrl = null;
} }
return parsedUrl; return parsedUrl;
@ -91,7 +100,7 @@ export class ScopedSeed
break; break;
default: default:
throw new Error(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`); logger.fatal(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`);
} }
return [include, allowHash]; return [include, allowHash];
@ -106,9 +115,8 @@ export class ScopedSeed
return false; return false;
} }
try { url = this.parseUrl(url);
url = this.parseUrl(url); if (!url) {
} catch(e) {
return false; return false;
} }

util/state.js

@ -1,6 +1,11 @@
import mod from "puppeteer-cluster/dist/Job.js"; import mod from "puppeteer-cluster/dist/Job.js";
import { Logger } from "./logger.js";
const Job = mod.default; const Job = mod.default;
const logger = new Logger();
// ============================================================================ // ============================================================================
export class BaseState export class BaseState
@ -102,7 +107,7 @@ export class MemoryCrawlState extends BaseState
}, },
reject(e) { reject(e) {
console.warn(`Page Load Failed: ${url}, Reason: ${e}`); logger.warn(`Page Load Failed: ${url}`, e.message);
state.pending.delete(url); state.pending.delete(url);
@ -316,7 +321,7 @@ return 0;
try { try {
data = JSON.parse(json); data = JSON.parse(json);
} catch(e) { } catch(e) {
console.error("Invalid queued json: ", json); logger.error("Invalid queued json", json);
return null; return null;
} }
@ -338,7 +343,7 @@ return 0;
}, },
async reject(e) { async reject(e) {
console.warn(`Page Load Failed: ${url}, Reason: ${e}`); logger.warn(`Page Load Failed: ${url}`, e.message);
await state._fail(url); await state._fail(url);
} }
}; };
@ -445,7 +450,7 @@ return 0;
for (const url of pendingUrls) { for (const url of pendingUrls) {
if (await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, url)) { if (await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, url)) {
console.log("Requeued: " + url); logger.info(`Requeued: ${url}`);
} }
} }
} }

util/storage.js

@ -7,9 +7,12 @@ import { createHash } from "crypto";
import Minio from "minio"; import Minio from "minio";
import { initRedis } from "./redis.js"; import { initRedis } from "./redis.js";
import { Logger } from "./logger.js";
import getFolderSize from "get-folder-size"; import getFolderSize from "get-folder-size";
const logger = new Logger();
// =========================================================================== // ===========================================================================
export class S3StorageSync export class S3StorageSync
@ -57,10 +60,13 @@ export class S3StorageSync
} }
async uploadFile(srcFilename, targetFilename) { async uploadFile(srcFilename, targetFilename) {
console.log(`Bucket: ${this.bucketName}`); const fileUploadInfo = {
console.log(`Crawl Id: ${this.crawlId}`); "bucket": this.bucketName,
console.log(`Prefix: ${this.objectPrefix}`); "crawlId": this.crawlId,
console.log(`Target Filename: ${targetFilename}`); "prefix": this.objectPrefix,
"targetFilename": this.targetFilename
};
logger.info("S3 file upload information", fileUploadInfo, "s3Upload");
await this.client.fPutObject(this.bucketName, this.objectPrefix + targetFilename, srcFilename); await this.client.fPutObject(this.bucketName, this.objectPrefix + targetFilename, srcFilename);
@ -76,7 +82,7 @@ export class S3StorageSync
async uploadCollWACZ(srcFilename, targetFilename, completed = true) { async uploadCollWACZ(srcFilename, targetFilename, completed = true) {
const resource = await this.uploadFile(srcFilename, targetFilename); const resource = await this.uploadFile(srcFilename, targetFilename);
console.log(resource); logger.info("WACZ S3 file upload resource", resource, "s3Upload");
if (this.webhookUrl) { if (this.webhookUrl) {
const body = { const body = {
@ -92,14 +98,14 @@ export class S3StorageSync
completed completed
}; };
console.log("Pinging Webhook: " + this.webhookUrl); logger.info(`Pinging Webhook: ${this.webhookUrl}`);
if (this.webhookUrl.startsWith("http://") || this.webhookUrl.startsWith("https://")) { if (this.webhookUrl.startsWith("http://") || this.webhookUrl.startsWith("https://")) {
await fetch(this.webhookUrl, {method: "POST", body: JSON.stringify(body)}); await fetch(this.webhookUrl, {method: "POST", body: JSON.stringify(body)});
} else if (this.webhookUrl.startsWith("redis://")) { } else if (this.webhookUrl.startsWith("redis://")) {
const parts = this.webhookUrl.split("/"); const parts = this.webhookUrl.split("/");
if (parts.length !== 5) { if (parts.length !== 5) {
throw new Error("redis webhook url must be in format: redis://<host>:<port>/<db>/<key>"); logger.fatal("redis webhook url must be in format: redis://<host>:<port>/<db>/<key>");
} }
const redis = await initRedis(parts.slice(0, 4).join("/")); const redis = await initRedis(parts.slice(0, 4).join("/"));
await redis.rpush(parts[4], JSON.stringify(body)); await redis.rpush(parts[4], JSON.stringify(body));

util/windowconcurrency.js

@ -1,5 +1,9 @@
import sbi from "puppeteer-cluster/dist/concurrency/SingleBrowserImplementation.js"; import sbi from "puppeteer-cluster/dist/concurrency/SingleBrowserImplementation.js";
import { Logger } from "./logger.js";
const logger = new Logger();
const SingleBrowserImplementation = sbi.default; const SingleBrowserImplementation = sbi.default;
@ -40,7 +44,7 @@ export class ReuseWindowConcurrency extends SingleBrowserImplementation {
} }
this.repairing = true; this.repairing = true;
console.debug("Starting repair"); logger.warn("Starting browser repair");
if (this.screencaster) { if (this.screencaster) {
this.screencaster.endAllTargets(); this.screencaster.endAllTargets();
@ -50,13 +54,13 @@ export class ReuseWindowConcurrency extends SingleBrowserImplementation {
// will probably fail, but just in case the repair was not necessary // will probably fail, but just in case the repair was not necessary
await this.browser.close(); await this.browser.close();
} catch (e) { } catch (e) {
console.debug("Unable to close browser."); logger.warn("Unable to close browser");
} }
try { try {
await this.init(); await this.init();
} catch (err) { } catch (err) {
console.debug("Unable to restart chrome."); logger.warn("Unable to restart chrome");
} }
this.repairRequested = false; this.repairRequested = false;
this.repairing = false; this.repairing = false;
@ -71,7 +75,7 @@ export class ReuseWindowConcurrency extends SingleBrowserImplementation {
const res = await this.cdp.send("Target.createTarget", {url: this.startPage, newWindow: true}); const res = await this.cdp.send("Target.createTarget", {url: this.startPage, newWindow: true});
targetId = res.targetId; targetId = res.targetId;
} catch (e) { } catch (e) {
console.warn(e); logger.warn("Error getting new page in window context", e);
await this.repair(); await this.repair();
} }