Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 14:33:17 +00:00)
Implement improved json-l logging
- Add Logger class with methods for info, error, warn, debug, fatal
- Add context, timestamp, and details fields to log entries
- Log messages as JSON Lines
- Replace puppeteer-cluster stats with custom stats implementation
- Log behaviors by default
- Amend argParser to reflect logging changes
- Capture and log stdout/stderr from awaited child_processes
- Modify tests to use webrecorder.net to avoid timeouts
parent 2b03e23174 · commit 0192d05f4c
16 changed files with 276 additions and 194 deletions
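For orientation before the diff, here is a minimal sketch of how the new Logger is used and the JSON Lines shape it emits. The constructor argument, method signatures, and field names are taken from the util/logger.js added in this commit; the timestamp and sample values are illustrative only:

import { Logger } from "./util/logger.js";

const logger = new Logger(true);  // pass true to enable debug-level output
logger.info("Storing state in memory", {}, "state");
logger.warn("Waiting for redis at redis://localhost/0", {}, "state");

// Each call prints a single JSON line to stdout, e.g.:
// {"logLevel":"info","timestamp":"2022-11-16T00:00:00.000Z","context":"state","message":"Storing state in memory","details":{}}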
crawler.js (229 changed lines)
@@ -23,6 +23,7 @@ import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screenca
import { Screenshots } from "./util/screenshots.js";
import { parseArgs } from "./util/argParser.js";
import { initRedis } from "./util/redis.js";
import { Logger } from "./util/logger.js";

import { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI } from "./util/browser.js";

@@ -46,6 +47,13 @@ const behaviors = fs.readFileSync(new URL("./node_modules/browsertrix-behaviors/
// ============================================================================
export class Crawler {
constructor() {
const res = parseArgs();
this.params = res.parsed;
this.origConfig = res.origConfig;

const debugLogging = this.params.logging.includes("debug");
this.logger = new Logger(debugLogging);

this.headers = {};
this.crawlState = null;

@@ -63,23 +71,17 @@ export class Crawler {
this.userAgent = "";

const res = parseArgs();
this.params = res.parsed;
this.origConfig = res.origConfig;

this.saveStateFiles = [];
this.lastSaveTime = 0;
this.saveStateInterval = this.params.saveStateInterval * 1000;

this.debugLogging = this.params.logging.includes("debug");
this.saveStateInterval = this.params.saveStateInterval * 1000;

if (this.params.profile) {
this.statusLog("With Browser Profile: " + this.params.profile);
this.logger.info(`With Browser Profile: ${this.params.profile}"`);
}

this.emulateDevice = this.params.emulateDevice;

this.debugLog("Seeds", this.params.scopedSeeds);
this.logger.info("Seeds", this.params.scopedSeeds);

this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
this.capturePrefix = process.env.NO_PROXY ? "" : this.captureBasePrefix + "/id_/";

@@ -110,18 +112,6 @@ export class Crawler {
this.done = false;

this.behaviorLastLine = null;

this.logConsole = false;
}

statusLog(...args) {
console.log(...args);
}

debugLog(...args) {
if (this.debugLogging) {
console.log(...args);
}
}

configureUA() {

@@ -158,7 +148,7 @@ export class Crawler {
if (redisUrl) {
if (!redisUrl.startsWith("redis://")) {
throw new Error("stateStoreUrl must start with redis:// -- Only redis-based store currently supported");
this.logger.fatal("stateStoreUrl must start with redis:// -- Only redis-based store currently supported");
}

let redis;
@@ -168,24 +158,24 @@
redis = await initRedis(redisUrl);
break;
} catch (e) {
//throw new Error("Unable to connect to state store Redis: " + redisUrl);
console.warn(`Waiting for redis at ${redisUrl}`);
//this.logger.fatal("Unable to connect to state store Redis: " + redisUrl);
this.logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state");
await this.sleep(3);
}
}

this.statusLog(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`);
this.logger.info(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state");

this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.timeout * 2, os.hostname());

} else {
this.statusLog("Storing state in memory");
this.logger.info("Storing state in memory", {}, "state");

this.crawlState = new MemoryCrawlState();
}

if (this.params.saveState === "always" && this.params.saveStateInterval) {
this.statusLog(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`);
this.logger.info(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`, {}, "state");
}

return this.crawlState;

@@ -196,10 +186,10 @@
if (this.params.screencastPort) {
transport = new WSTransport(this.params.screencastPort);
this.debugLog(`Screencast server started on: ${this.params.screencastPort}`);
this.logger.debug(`Screencast server started on: ${this.params.screencastPort}`, {}, "screencast");
} else if (this.params.redisStoreUrl && this.params.screencastRedis) {
transport = new RedisPubSubTransport(this.params.redisStoreUrl, this.crawlId);
this.debugLog("Screencast enabled via redis pubsub");
this.logger.debug("Screencast enabled via redis pubsub", {}, "screencast");
}

if (!transport) {

@@ -215,7 +205,7 @@
const initRes = child_process.spawnSync("wb-manager", ["init", this.params.collection], {cwd: this.params.cwd});

if (initRes.status) {
console.log("wb-manager init failed, collection likely already exists");
this.logger.info("wb-manager init failed, collection likely already exists");
}

await fsp.mkdir(logs, {recursive: true});

@@ -236,8 +226,6 @@
redisStdio = "ignore";
}

this.logConsole = this.params.logging.includes("jserrors");

this.browserExe = getBrowserExe();

this.configureUA();

@@ -249,11 +237,11 @@
subprocesses.push(child_process.spawn("redis-server", {cwd: "/tmp/", stdio: redisStdio}));

if (this.params.overwrite) {
console.log(`Clearing ${this.collDir} before starting`);
this.logger.info(`Clearing ${this.collDir} before starting`);
try {
fs.rmSync(this.collDir, { recursive: true, force: true });
} catch(e) {
console.warn(e);
this.logger.error(`Unable to clear ${this.collDir}`, e);
}
}

@@ -318,8 +306,7 @@
await this.crawl();
status = (!this.interrupted ? "done" : "interrupted");
} catch(e) {
console.error("Crawl failed");
console.error(e);
this.logger.error("Crawl failed", e);
exitCode = 9;
status = "failing";
if (await this.crawlState.incFailCount()) {

@@ -327,7 +314,7 @@
}

} finally {
console.log(status);
this.logger.info(`Crawl status: ${status}`);

if (this.crawlState) {
await this.crawlState.setStatus(status);
@@ -344,16 +331,14 @@ export class Crawler {
case "info":
behaviorLine = JSON.stringify(data);
if (behaviorLine != this._behaviorLastLine) {
console.log(behaviorLine);
this.logger.info("Behavior line", data, "behavior");
this._behaviorLastLine = behaviorLine;
}
break;

case "debug":
default:
if (this.params.behaviorsLogDebug) {
console.log("behavior debug: " + JSON.stringify(data));
}
this.logger.debug("Behavior debug", data, "behavior");
}
}

@@ -367,7 +352,7 @@
const {page, data} = opts;

if (!this.isInScope(data)) {
console.log(`No longer in scope: ${JSON.stringify(data)}`);
this.logger.info("Page no longer in scope", data);
return;
}

@@ -425,7 +410,7 @@
if (this.params.behaviorOpts) {
if (!page.isHTMLPage) {
console.log("Skipping behaviors for non-HTML page");
this.logger.info("Skipping behaviors for non-HTML page", data, "behavior");
} else {
await Promise.allSettled(page.frames().map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();")));

@@ -434,6 +419,8 @@
}
}

this.logger.info("Page graph data for successfully crawled page", data, "pageGraph");

await this.writeStats();

await this.checkLimits();

@@ -441,7 +428,7 @@
await this.serializeConfig();

} catch (e) {
console.warn(e);
this.logger.error(`Error crawling page ${data.url}`, e.message);
await this.markPageFailed(page);
}
}

@@ -470,14 +457,14 @@
switch (pathname) {
case "/healthz":
if (this.errorCount < threshold) {
console.log(`health check ok, num errors ${this.errorCount} < ${threshold}`);
this.logger.debug(`health check ok, num errors ${this.errorCount} < ${threshold}`);
res.writeHead(200);
res.end();
}
return;
}

console.log(`health check failed: ${this.errorCount} >= ${threshold}`);
this.logger.error(`health check failed: ${this.errorCount} >= ${threshold}`);
res.writeHead(503);
res.end();
}

@@ -491,7 +478,7 @@
const size = await getDirSize(dir);

if (size >= this.params.sizeLimit) {
console.log(`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`);
this.logger.info(`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`);
interrupt = true;
this.clearOnExit = true;
}

@@ -500,7 +487,7 @@
if (this.params.timeLimit) {
const elapsed = (Date.now() - this.startTime) / 1000;
if (elapsed > this.params.timeLimit) {
console.log(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
this.logger.info(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
interrupt = true;
}
}
@@ -522,9 +509,9 @@
if (!markDone) {
this.params.waitOnDone = false;
this.clearOnExit = true;
console.log("SIGNAL: Preparing for exit of this crawler instance only");
this.logger.info("SIGNAL: Preparing for exit of this crawler instance only");
} else {
console.log("SIGNAL: Preparing for final exit of all crawlers");
this.logger.info("SIGNAL: Preparing for final exit of all crawlers");
this.finalExit = true;
}
}

@@ -539,7 +526,7 @@
if (this.params.healthCheckPort) {
this.healthServer = http.createServer((...args) => this.healthCheck(...args));
this.statusLog(`Healthcheck server started on ${this.params.healthCheckPort}`);
this.logger.info(`Healthcheck server started on ${this.params.healthCheckPort}`);
this.healthServer.listen(this.params.healthCheckPort);
}

@@ -547,7 +534,7 @@
const driverUrl = new URL(this.params.driver, import.meta.url);
this.driver = (await import(driverUrl)).default;
} catch(e) {
console.warn(e);
this.logger.warn(`Error importing driver ${this.params.driver}`, e);
return;
}

@@ -556,7 +543,7 @@
let initState = await this.crawlState.getStatus();

while (initState === "debug") {
console.log("Paused for debugging, will continue after manual resume");
this.logger.info("Paused for debugging, will continue after manual resume");

await this.sleep(60);

@@ -568,7 +555,7 @@
this.done = true;

if (this.params.waitOnDone) {
this.statusLog("Already done, waiting for signal to exit...");
this.logger.info("Already done, waiting for signal to exit...");

// wait forever until signal
await new Promise(() => {});

@@ -589,7 +576,7 @@
timeout: this.params.timeout * 2,
puppeteerOptions: this.puppeteerArgs,
puppeteer,
monitor: this.params.logging.includes("stats")
monitor: false
});

@@ -606,11 +593,11 @@
await this.initPages();

if (this.params.blockAds) {
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, (text) => this.debugLog(text));
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, this.logger);
}

if (this.params.blockRules && this.params.blockRules.length) {
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text));
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, this.logger);
}

this.screencaster = this.initScreenCaster();

@@ -637,8 +624,6 @@
await this.serializeConfig(true);

this.writeStats();

if (this.pagesFH) {
await this.pagesFH.sync();
await this.pagesFH.close();
@@ -652,27 +637,28 @@
}

if (this.params.generateCDX) {
this.statusLog("Generating CDX");

await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd}));
this.logger.info("Generating CDX");
await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
}

await this.writeStats(true);

if (this.params.generateWACZ && (!this.interrupted || this.finalExit || this.clearOnExit)) {
await this.generateWACZ();

if (this.clearOnExit) {
console.log(`Clearing ${this.collDir} before exit`);
this.logger.info(`Clearing ${this.collDir} before exit`);
try {
fs.rmSync(this.collDir, { recursive: true, force: true });
} catch(e) {
console.warn(e);
this.logger.warn(`Unable to clear ${this.collDir} before exit`, e);
}
}
}

if (this.params.waitOnDone && (!this.interrupted || this.finalExit)) {
this.done = true;
this.statusLog("All done, waiting for signal...");
this.logger.info("All done, waiting for signal...");
await this.crawlState.setStatus("done");

// wait forever until signal

@@ -681,7 +667,7 @@
}

async generateWACZ() {
this.statusLog("Generating WACZ");
this.logger.info("Generating WACZ");

const archiveDir = path.join(this.collDir, "archive");

@@ -691,13 +677,13 @@
// is finished (>0 pages and all pages written)
const isFinished = await this.crawlState.isFinished();

console.log(`Num WARC Files: ${warcFileList.length}`);
this.logger.info(`Num WARC Files: ${warcFileList.length}`);
if (!warcFileList.length) {
// if finished, just return
if (isFinished) {
return;
}
throw new Error("No WARC Files, assuming crawl failed");
this.logger.fatal("No WARC Files, assuming crawl failed");
}

// Build the argument list to pass to the wacz create command

@@ -720,14 +706,14 @@
warcFileList.forEach((val, index) => createArgs.push(path.join(archiveDir, val))); // eslint-disable-line no-unused-vars

// create WACZ
const waczResult = await this.awaitProcess(child_process.spawn("wacz" , createArgs, {stdio: "inherit"}));
const waczResult = await this.awaitProcess(child_process.spawn("wacz" , createArgs));

if (waczResult !== 0) {
console.log("create result", waczResult);
throw new Error("Unable to write WACZ successfully");
this.logger.error("Error creating WACZ", {"status code": waczResult});
this.logger.fatal("Unable to write WACZ successfully");
}

this.debugLog(`WACZ successfully generated and saved to: ${waczPath}`);
this.logger.debug(`WACZ successfully generated and saved to: ${waczPath}`);

// Verify WACZ
/*

@@ -735,11 +721,11 @@
validateArgs.push("-f");
validateArgs.push(waczPath);

const waczVerifyResult = await this.awaitProcess(child_process.spawn("wacz", validateArgs, {stdio: "inherit"}));
const waczVerifyResult = await this.awaitProcess(child_process.spawn("wacz", validateArgs));

if (waczVerifyResult !== 0) {
console.log("validate", waczVerifyResult);
throw new Error("Unable to verify WACZ created successfully");
this.logger.fatal("Unable to verify WACZ created successfully");
}
*/
if (this.storage) {
|
|||
}
|
||||
|
||||
awaitProcess(proc) {
|
||||
proc.stdout.on("data", (data) => {
|
||||
this.logger.info(data.toString());
|
||||
});
|
||||
|
||||
proc.stderr.on("data", (data) => {
|
||||
this.logger.error(data.toString());
|
||||
});
|
||||
return new Promise((resolve) => {
|
||||
proc.on("close", (code) => resolve(code));
|
||||
});
|
||||
}
|
||||
|
||||
async writeStats() {
|
||||
if (this.params.statsFilename) {
|
||||
const total = this.cluster.allTargetCount;
|
||||
const workersRunning = this.cluster.workersBusy.length;
|
||||
const numCrawled = total - (await this.crawlState.size()) - workersRunning;
|
||||
const limit = {max: this.params.limit || 0, hit: this.limitHit};
|
||||
const stats = {numCrawled, workersRunning, total, limit};
|
||||
async writeStats(toFile=false) {
|
||||
if (!this.params.logging.includes("stats")) {
|
||||
return;
|
||||
}
|
||||
|
||||
const realSize = await this.crawlState.realSize();
|
||||
const pending = await this.crawlState.numRealPending();
|
||||
const done = await this.crawlState.numDone();
|
||||
const total = realSize + pending + done;
|
||||
const limit = {max: this.params.limit || 0, hit: this.limitHit};
|
||||
const stats = {
|
||||
"crawled": done,
|
||||
"total": total,
|
||||
"pending": pending,
|
||||
"limit": limit,
|
||||
"pendingPages": Array.from(this.crawlState.pending.values()).map(x => JSON.stringify(x))
|
||||
};
|
||||
|
||||
this.logger.info("Crawl statistics", stats, "crawlState");
|
||||
|
||||
if (toFile && this.params.statsFilename) {
|
||||
try {
|
||||
await fsp.writeFile(this.params.statsFilename, JSON.stringify(stats, null, 2));
|
||||
} catch (err) {
|
||||
console.warn("Stats output failed", err);
|
||||
this.logger.warn("Stats output failed", err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
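To make the stats change above concrete, this is roughly what a single "Crawl statistics" entry from the new writeStats() would look like on stdout. The field names come from the stats object above and from the Logger in util/logger.js further down; the counts and timestamp are made up for illustration:

// {"logLevel":"info","timestamp":"2022-11-16T00:00:00.000Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":12,"total":40,"pending":3,"limit":{"max":0,"hit":false},"pendingPages":[]}}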
@@ -807,26 +813,24 @@ export class Crawler {
// more serious page error, mark page session as invalid
page.on("error", () => this.markPageFailed(page));

if (this.logConsole) {
page.on("console", (msg) => {
if (msg.type() === "error") {
console.log(msg.text(), msg.location());
}
});
}
page.on("console", (msg) => {
if (this.params.logging.includes("jserrors") && (msg.type() === "error")) {
this.logger.warn(msg.text(), {"location": msg.location()}, "jsError");
}
});

const gotoOpts = isHTMLPage ? this.gotoOpts : "domcontentloaded";

try {
await page.goto(url, gotoOpts);
if (this.errorCount > 0) {
this.statusLog(`Page loaded, resetting error count ${this.errorCount} to 0`);
this.logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`);
this.errorCount = 0;
}
} catch (e) {
let msg = e.message || "";
if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
this.statusLog(`Load Error: ${url}: ${msg}`);
this.logger.error(`Load Error: ${url}: ${msg}`);
this.errorCount++;
}
}

@@ -875,7 +879,7 @@
try {
await page.waitForNetworkIdle({timeout: this.params.netIdleWait * 1000});
} catch (e) {
console.log("note: waitForNetworkIdle timed out, ignoring");
this.logger.info("note: waitForNetworkIdle timed out, ignoring");
// ignore, continue
}
}

@@ -906,7 +910,7 @@
}

} catch (e) {
console.warn("Link Extraction failed", e);
this.logger.warn("Link Extraction failed", e);
}
return results;
}

@@ -932,19 +936,18 @@
}
}
} catch (e) {
console.error("Queuing Error: ", e);
this.logger.error("Queuing Error", e);
}
}

async checkCF(page) {
try {
while (await page.$("div.cf-browser-verification.cf-im-under-attack")) {
this.statusLog("Cloudflare Check Detected, waiting for reload...");
this.logger.info("Cloudflare Check Detected, waiting for reload...");
await this.sleep(5.5);
}
} catch (e) {
//console.warn("Check CF failed, ignoring");
//console.warn(e);
this.logger.warn("Check CF failed, ignoring", e.message);
}
}

@@ -987,17 +990,17 @@
const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
if (this.params.text) {
header["hasText"] = true;
this.statusLog("Text Extraction: Enabled");
this.logger.info("Text Extraction: Enabled");
} else {
header["hasText"] = false;
this.statusLog("Text Extraction: Disabled");
this.logger.info("Text Extraction: Disabled");
}
const header_formatted = JSON.stringify(header).concat("\n");
await this.pagesFH.writeFile(header_formatted);
}

} catch(err) {
console.error("pages/pages.jsonl creation failed", err);
this.logger.error("pages/pages.jsonl creation failed", err);
}
}

@@ -1017,7 +1020,7 @@
try {
await this.pagesFH.writeFile(processedRow);
} catch (err) {
console.warn("pages/pages.jsonl append failed", err);
this.logger.warn("pages/pages.jsonl append failed", err);
}
}

@@ -1033,7 +1036,7 @@
agent: this.resolveAgent
});
if (resp.status !== 200) {
this.debugLog(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
this.logger.debug(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
return true;
}

@@ -1067,7 +1070,7 @@
}

async awaitPendingClear() {
this.statusLog("Waiting to ensure pending data is written to WARCs...");
this.logger.info("Waiting to ensure pending data is written to WARCs...");

const redis = await initRedis("redis://localhost/0");

@@ -1080,7 +1083,7 @@
if (count <= 0) {
break;
}
this.debugLog(`Still waiting for ${count} pending requests to finish...`);
this.logger.debug(`Still waiting for ${count} pending requests to finish...`);
} catch (e) {
break;
}

@@ -1104,17 +1107,17 @@
const { sites } = await sitemapper.fetch();
await this.queueInScopeUrls(seedId, sites, 0);
} catch(e) {
console.warn(e);
this.logger.warn("Error fetching sites from sitemap", e);
}
}

async combineWARC() {
this.statusLog("Generating Combined WARCs");
this.logger.info("Generating Combined WARCs");

// Get the list of created Warcs
const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));

this.debugLog(`Combining ${warcLists.length} WARCs...`);
this.logger.debug(`Combining ${warcLists.length} WARCs...`);

const fileSizeObjects = []; // Used to sort the created warc by fileSize

@@ -1181,7 +1184,7 @@
fh.write(warcBuffer);
}

this.debugLog(`Appending WARC ${fileSizeObjects[j].fileName}`);
this.logger.debug(`Appending WARC ${fileSizeObjects[j].fileName}`);

const reader = fs.createReadStream(fileSizeObjects[j].fileName);

@@ -1198,7 +1201,7 @@
await fh.end();
}

this.debugLog(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
this.logger.debug(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
}

async serializeConfig(done = false) {

@@ -1248,10 +1251,10 @@
}
const res = yaml.dump(this.origConfig, {lineWidth: -1});
try {
this.statusLog("Saving crawl state to: " + filename);
this.logger.info(`Saving crawl state to: ${filename}`);
await fsp.writeFile(filename, res);
} catch (e) {
console.error(`Failed to write save state file: ${filename}`, e);
this.logger.error(`Failed to write save state file: ${filename}`, e);
return;
}

@@ -1259,11 +1262,11 @@
if (this.saveStateFiles.length > this.params.saveStateHistory) {
const oldFilename = this.saveStateFiles.shift();
this.statusLog(`Removing old save-state: ${oldFilename}`);
this.logger.info(`Removing old save-state: ${oldFilename}`);
try {
await fsp.unlink(oldFilename);
} catch (e) {
console.error(`Failed to delete old save state file: ${oldFilename}`);
this.logger.error(`Failed to delete old save state file: ${oldFilename}`);
}
}
main.js (17 changed lines)
@@ -1,6 +1,9 @@
#!/usr/bin/env -S node --experimental-global-webcrypto

import { Crawler } from "./crawler.js";
import { Logger } from "./util/logger.js";

const logger = new Logger();

var crawler = null;

@@ -9,29 +12,29 @@ let forceTerm = false;
async function handleTerminate(signame) {
console.log(`${signame} received...`);
logger.info(`${signame} received...`);
if (!crawler || !crawler.crawlState) {
console.log("error: no crawler running, exiting");
logger.error("error: no crawler running, exiting");
process.exit(1);
}

if (crawler.done) {
console.log("success: crawler done, exiting");
logger.info("success: crawler done, exiting");
process.exit(0);
}

try {
if (!crawler.crawlState.drainMax) {
console.log("SIGNAL: gracefully finishing current pages...");
logger.info("SIGNAL: gracefully finishing current pages...");
crawler.gracefulFinish();

} else if (forceTerm || (Date.now() - lastSigInt) > 200) {
console.log("SIGNAL: stopping crawl now...");
logger.info("SIGNAL: stopping crawl now...");
await crawler.serializeAndExit();
}
lastSigInt = Date.now();
} catch (e) {
console.log(e);
logger.error("Error stopping crawl after receiving termination signal", e);
}
}

@@ -40,7 +43,7 @@ process.on("SIGINT", () => handleTerminate("SIGINT"));
process.on("SIGTERM", () => handleTerminate("SIGTERM"));

process.on("SIGABRT", async () => {
console.log("SIGABRT received, will force immediate exit on SIGTERM/SIGINT");
logger.info("SIGABRT received, will force immediate exit on SIGTERM/SIGINT");
forceTerm = true;
});
@@ -45,7 +45,7 @@ test("check yaml config file with seed list is used", async () => {
test("check yaml config file will be overwritten by command line", async () => {
try{

await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://www.example.com --timeout 20000");
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://specs.webrecorder.net/ --scopeType page --timeout 20000");
}
catch (error) {
console.log(error);

@@ -61,7 +61,7 @@ test("check yaml config file will be overwritten by command line", async () => {
}
}

expect(pages.has("https://www.example.com/")).toBe(true);
expect(pages.has("https://specs.webrecorder.net/")).toBe(true);
expect(pages.size).toBe(1);

});
@@ -8,7 +8,7 @@ const exec = util.promisify(execCallback);
test("check that URLs are crawled 2 extra hops beyond depth", async () => {
try {
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7");
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://webrecorder.net/ --limit 7");
}
catch (error) {
console.log(error);

@@ -17,13 +17,13 @@ test("check that URLs are crawled 2 extra hops beyond depth", async () => {
const crawled_pages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8");

const expectedPages = [
"https://example.com/",
"https://www.iana.org/domains/example",
"http://www.iana.org/",
"http://www.iana.org/domains",
"http://www.iana.org/protocols",
"http://www.iana.org/numbers",
"http://www.iana.org/about",
"https://webrecorder.net/",
"https://webrecorder.net/blog",
"https://webrecorder.net/tools",
"https://webrecorder.net/community",
"https://webrecorder.net/about",
"https://webrecorder.net/contact",
"https://webrecorder.net/faq",
];

for (const page of crawled_pages.trim().split("\n")) {
tests/fixtures/crawl-1.yaml (4 changed lines, vendored)
@@ -1,8 +1,8 @@
name: crawl-test-1
collection: configtest
seeds:
- https://www.example.org
- https://www.iana.org/
- https://webrecorder.net/
- https://specs.webrecorder.net/

generateWACZ: true
tests/fixtures/urlSeedFile.txt (4 changed lines, vendored)
@@ -1,2 +1,2 @@
https://www.example.org
https://www.example.com
https://webrecorder.net/about/
https://specs.webrecorder.net/wacz/1.1.1/
@@ -4,10 +4,10 @@ import fs from "fs";
const exec = util.promisify(execCallback);

test("check that URLs one-depth out from the seed-list are crawled", async () => {
test("check that URLs in seed-list are crawled", async () => {
try {

await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000");
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000");
}
catch (error) {
console.log(error);
@@ -13,6 +13,9 @@ import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
import { screenshotTypes } from "./screenshots.js";
import { Logger } from "./logger.js";

const logger = new Logger();

// ============================================================================

@@ -171,7 +174,7 @@ class ArgParser {
},

"logging": {
describe: "Logging options for crawler, can include: stats, pywb, behaviors, behaviors-debug, jserrors",
describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug",
type: "string",
default: "stats",
},

@@ -351,7 +354,7 @@ class ArgParser {
// Check that the collection name is valid.
if (argv.collection.search(/^[\w][\w-]*$/) === -1){
throw new Error(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
}

argv.timeout *= 1000;

@@ -365,7 +368,7 @@ class ArgParser {
for (const opt of argv.waitUntil) {
if (!WAIT_UNTIL_OPTS.includes(opt)) {
throw new Error("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));
logger.fatal("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));
}
}

@@ -394,30 +397,25 @@ class ArgParser {
if (argv.behaviorTimeout) {
behaviorOpts.timeout = argv.behaviorTimeout *= 1000;
}
if (argv.logging.includes("behaviors")) {
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
} else if (argv.logging.includes("behaviors-debug")) {
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
argv.behaviorsLogDebug = true;
}
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
argv.behaviorOpts = JSON.stringify(behaviorOpts);

if (argv.newContext) {
console.log("Note: The newContext argument is deprecated in 0.8.0. Values passed to this option will be ignored");
logger.info("Note: The newContext argument is deprecated in 0.8.0. Values passed to this option will be ignored");
}

if (argv.workers > 1) {
console.log("Window context being used to support >1 workers");
logger.info("Window context being used to support >1 workers");
argv.newContext = ReuseWindowConcurrency;
} else {
console.log("Page context being used with 1 worker");
logger.info("Page context being used with 1 worker");
argv.newContext = Cluster.CONCURRENCY_PAGE;
}

if (argv.mobileDevice) {
argv.emulateDevice = puppeteer.devices[argv.mobileDevice];
if (!argv.emulateDevice) {
throw new Error("Unknown device: " + argv.mobileDevice);
logger.fatal("Unknown device: " + argv.mobileDevice);
}
}

@@ -442,13 +440,13 @@ class ArgParser {
} else {
argv.netIdleWait = 2;
}
console.log(`Set netIdleWait to ${argv.netIdleWait} seconds`);
logger.info(`Set netIdleWait to ${argv.netIdleWait} seconds`);
}

// prefer argv.include only if string or a non-empty array
if (argv.include && (typeof(argv.include) === "string" || argv.include.length)) {
if (argv.scopeType && argv.scopeType !== "custom") {
console.warn("You've specified a --scopeType and a --scopeIncludeRx / --include regex. The custom scope regex will take precedence, overriding the scopeType");
logger.info("You've specified a --scopeType and a --scopeIncludeRx / --include regex. The custom scope regex will take precedence, overriding the scopeType");
argv.scopeType = "custom";
}
}
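The crawler consumes this --logging option with simple substring checks on the option string, as the crawler.js hunks earlier in this diff show. A hedged sketch of the resulting switches (the local variable names here are illustrative; only the option values stats, jserrors, pywb, and debug come from this diff):

// e.g. --logging "stats,jserrors,debug"   (yargs option of type string, default "stats")
const logging = "stats,jserrors,debug";
const debugLogging = logging.includes("debug");      // turns on logger.debug() output
const logJsErrors = logging.includes("jserrors");    // page console errors logged in the "jsError" context
const logStats = logging.includes("stats");          // enables the "Crawl statistics" entries from writeStats()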
@@ -1,5 +1,9 @@
import fs from "fs";

import { Logger } from "./logger.js";

const logger = new Logger();

const RULE_TYPES = ["block", "allowOnly"];

const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];

@@ -28,7 +32,7 @@ class BlockRule
}

if (!RULE_TYPES.includes(this.type)) {
throw new Error("Rule \"type\" must be: " + RULE_TYPES.join(", "));
logger.fatal("Rule \"type\" must be: " + RULE_TYPES.join(", "));
}
}

@@ -47,11 +51,11 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
// ===========================================================================
export class BlockRules
{
constructor(blockRules, blockPutUrl, blockErrMsg, debugLog) {
constructor(blockRules, blockPutUrl, blockErrMsg, logger) {
this.rules = [];
this.blockPutUrl = blockPutUrl;
this.blockErrMsg = blockErrMsg;
this.debugLog = debugLog;
this.logger = logger;

this.blockedUrlSet = new Set();

@@ -60,9 +64,9 @@ export class BlockRules
}

if (this.rules.length) {
this.debugLog("URL Block Rules:\n");
this.logger.debug("URL Block Rules:\n");
for (const rule of this.rules) {
this.debugLog(rule.toString());
this.logger.debug(rule.toString());
}
}
}

@@ -84,7 +88,7 @@ export class BlockRules
try {
await this.handleRequest(request);
} catch (e) {
console.warn(e);
this.logger.warn("Error handling request", e);
}
});
}

@@ -104,7 +108,7 @@ export class BlockRules
}

} catch (e) {
this.debugLog(`Block: (${blockState}) Failed On: ${url} Reason: ${e}`);
this.logger.debug(`Block: (${blockState}) Failed On: ${url}`, e);
}
}

@@ -151,10 +155,10 @@ export class BlockRules
if (block) {
if (blockState === BlockState.BLOCK_PAGE_NAV) {
console.warn(`Warning: Block rule match for page request "${url}" ignored, set --exclude to block full pages`);
this.logger.warn(`Warning: Block rule match for page request "${url}" ignored, set --exclude to block full pages`);
return BlockState.ALLOW;
}
this.debugLog(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`);
this.logger.debug(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`);
await this.recordBlockMsg(url);
return blockState;
}

@@ -187,7 +191,7 @@ export class BlockRules
}

const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly;
this.debugLog(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`);
this.logger.debug(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`);
return {block, done: true};
}

@@ -204,7 +208,7 @@ export class BlockRules
return !!text.match(frameTextMatch);

} catch (e) {
this.debugLog(e);
this.logger.debug("Error determining text match", e);
}
}

@@ -230,8 +234,8 @@ export class BlockRules
// ===========================================================================
export class AdBlockRules extends BlockRules
{
constructor(blockPutUrl, blockErrMsg, debugLog, adhostsFilePath = "../ad-hosts.json") {
super([], blockPutUrl, blockErrMsg, debugLog);
constructor(blockPutUrl, blockErrMsg, logger, adhostsFilePath = "../ad-hosts.json") {
super([], blockPutUrl, blockErrMsg, logger);
this.adhosts = JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url)));
}

@@ -248,7 +252,7 @@ export class AdBlockRules extends BlockRules
try {
await this.handleRequest(request);
} catch (e) {
console.warn(e);
this.logger.warn("Error handling request", e);
}
});
}

@@ -257,7 +261,7 @@ export class AdBlockRules extends BlockRules
const fragments = url.split("/");
const domain = fragments.length > 2 ? fragments[2] : null;
if (this.adhosts.includes(domain)) {
this.debugLog(`URL blocked for being an ad: ${url}`);
this.logger.debug(`URL blocked for being an ad: ${url}`);
await this.recordBlockMsg(url);
return BlockState.BLOCK_AD;
}
@@ -4,6 +4,9 @@ import path from "path";
import os from "os";
import request from "request";
import { initStorage } from "./storage.js";
import { Logger } from "./logger.js";

const logger = new Logger();

const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));

@@ -13,7 +16,7 @@ export async function loadProfile(profileFilename) {
if (profileFilename &&
(profileFilename.startsWith("http:") || profileFilename.startsWith("https:"))) {

console.log(`Downloading ${profileFilename} to ${targetFilename}`);
logger.info(`Downloading ${profileFilename} to ${targetFilename}`, {}, "browserProfile");

const p = new Promise((resolve, reject) => {
request.get(profileFilename).

@@ -29,7 +32,7 @@ export async function loadProfile(profileFilename) {
const storage = initStorage("");

if (!storage) {
throw new Error("Profile specified relative to s3 storage, but no S3 storage defined");
logger.fatal("Profile specified relative to s3 storage, but no S3 storage defined");
}

await storage.downloadFile(profileFilename.slice(1), targetFilename);

@@ -41,7 +44,7 @@ export async function loadProfile(profileFilename) {
try {
child_process.execSync("tar xvfz " + profileFilename, {cwd: profileDir});
} catch (e) {
console.error(`Profile filename ${profileFilename} not a valid tar.gz`);
logger.error(`Profile filename ${profileFilename} not a valid tar.gz`);
}
}

@@ -72,7 +75,7 @@ export function getDefaultUA() {
version = child_process.execFileSync(getBrowserExe(), ["--version"], {encoding: "utf8"});
version = version.match(/[\d.]+/)[0];
} catch(e) {
console.error(e);
logger.error("Error getting default UserAgent", e);
}

return `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;

@@ -158,7 +161,7 @@ export async function evaluateWithCLI(frame, funcString) {
});

if (exceptionDetails) {
throw new Error(
logger.fatal(
"Behavior Evaluation Failed" + exceptionDetails.text
);
}
util/logger.js (new file, 45 lines)
@@ -0,0 +1,45 @@
// ===========================================================================

export class Logger
{
constructor(debugLogging=false) {
this.debugLogging = debugLogging;
}

logAsJSON(message, data, context, logLevel="info") {
if (typeof data !== "object") {
data = {"message": data.toString()};
}
let dataToLog = {
"logLevel": logLevel,
"timestamp": new Date().toISOString(),
"context": context,
"message": message,
"details": data ? data : {}
};
console.log(JSON.stringify(dataToLog));
}

info(message, data={}, context="general") {
this.logAsJSON(message, data, context);
}

error(message, data={}, context="general") {
this.logAsJSON(message, data, context, "error");
}

warn(message, data={}, context="general") {
this.logAsJSON(message, data, context, "warn");
}

debug(message, data={}, context="general") {
if (this.debugLogging) {
this.logAsJSON(message, data, context, "debug");
}
}

fatal(message, data={}, context="general", exitCode=1) {
this.logAsJSON(`${message}. Quitting`, data, context, "fatal");
process.exit(exitCode);
}
}
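A brief usage note on the class above (illustrative only, using just the methods it defines): debug() is silent unless the Logger was constructed with debugLogging=true, and fatal() appends ". Quitting" to the message and then calls process.exit(), which is what lets it replace the former throw new Error(...) sites in crawler.js, argParser.js, browser.js, and the other files in this diff:

const logger = new Logger(false);
logger.debug("not printed: debugLogging is false");
logger.error("Crawl failed", {url: "https://example.com/"});   // the details object is optional
logger.fatal("No WARC Files, assuming crawl failed");          // logs at level "fatal", then process.exit(1)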
@@ -2,6 +2,9 @@ import fs from "fs";
import path from "path";
import * as warcio from "warcio";

import { Logger } from "./logger.js";

const logger = new Logger();

// ============================================================================

@@ -43,9 +46,9 @@ export class Screenshots {
const warcRecord = await this.wrap(screenshotBuffer, screenshotType, options.type);
const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});
fs.appendFileSync(this.warcName, warcRecordBuffer);
console.log(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
} catch (e) {
console.log(`Taking screenshot (type: ${screenshotType}) failed for ${this.url}`, e);
logger.error(`Taking screenshot (type: ${screenshotType}) failed for ${this.url}`, e.message);
}
}
@@ -1,7 +1,15 @@
import { Logger } from "./logger.js";

const logger = new Logger();


export class ScopedSeed
{
constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
const parsedUrl = this.parseUrl(url);
if (!parsedUrl) {
logger.fatal(`Invalid Seed "${url}" - not a valid URL`);
}
this.url = parsedUrl.href;
this.include = this.parseRx(include);
this.exclude = this.parseRx(exclude);

@@ -36,11 +44,12 @@ export class ScopedSeed
try {
parsedUrl = new URL(url.trim());
} catch (e) {
throw new Error(`Invalid Seed "${url}" - not a valid URL`);
logger.error(`Invalid Seed "${url}" - not a valid URL`);
}

if (parsedUrl.protocol !== "http:" && parsedUrl.protocol != "https:") {
throw new Error(`Invalid Seed "${url}" - URL must start with http:// or https://`);
logger.error(`Invalid Seed "${url}" - URL must start with http:// or https://`);
parsedUrl = null;
}

return parsedUrl;

@@ -91,7 +100,7 @@ export class ScopedSeed
break;

default:
throw new Error(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`);
logger.fatal(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`);
}

return [include, allowHash];

@@ -106,9 +115,8 @@ export class ScopedSeed
return false;
}

try {
url = this.parseUrl(url);
} catch(e) {
url = this.parseUrl(url);
if (!url) {
return false;
}
@@ -1,6 +1,11 @@
import mod from "puppeteer-cluster/dist/Job.js";

import { Logger } from "./logger.js";

const Job = mod.default;

const logger = new Logger();


// ============================================================================
export class BaseState

@@ -102,7 +107,7 @@ export class MemoryCrawlState extends BaseState
},

reject(e) {
console.warn(`Page Load Failed: ${url}, Reason: ${e}`);
logger.warn(`Page Load Failed: ${url}`, e.message);

state.pending.delete(url);

@@ -316,7 +321,7 @@ return 0;
try {
data = JSON.parse(json);
} catch(e) {
console.error("Invalid queued json: ", json);
logger.error("Invalid queued json", json);
return null;
}

@@ -338,7 +343,7 @@ return 0;
},

async reject(e) {
console.warn(`Page Load Failed: ${url}, Reason: ${e}`);
logger.warn(`Page Load Failed: ${url}`, e.message);
await state._fail(url);
}
};

@@ -445,7 +450,7 @@ return 0;
for (const url of pendingUrls) {
if (await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, url)) {
console.log("Requeued: " + url);
logger.info(`Requeued: ${url}`);
}
}
}
@@ -7,9 +7,12 @@ import { createHash } from "crypto";
import Minio from "minio";

import { initRedis } from "./redis.js";
import { Logger } from "./logger.js";

import getFolderSize from "get-folder-size";

const logger = new Logger();


// ===========================================================================
export class S3StorageSync

@@ -57,10 +60,13 @@ export class S3StorageSync
}

async uploadFile(srcFilename, targetFilename) {
console.log(`Bucket: ${this.bucketName}`);
console.log(`Crawl Id: ${this.crawlId}`);
console.log(`Prefix: ${this.objectPrefix}`);
console.log(`Target Filename: ${targetFilename}`);
const fileUploadInfo = {
"bucket": this.bucketName,
"crawlId": this.crawlId,
"prefix": this.objectPrefix,
"targetFilename": this.targetFilename
};
logger.info("S3 file upload information", fileUploadInfo, "s3Upload");

await this.client.fPutObject(this.bucketName, this.objectPrefix + targetFilename, srcFilename);

@@ -76,7 +82,7 @@ export class S3StorageSync
async uploadCollWACZ(srcFilename, targetFilename, completed = true) {
const resource = await this.uploadFile(srcFilename, targetFilename);
console.log(resource);
logger.info("WACZ S3 file upload resource", resource, "s3Upload");

if (this.webhookUrl) {
const body = {

@@ -92,14 +98,14 @@ export class S3StorageSync
completed
};

console.log("Pinging Webhook: " + this.webhookUrl);
logger.info(`Pinging Webhook: ${this.webhookUrl}`);

if (this.webhookUrl.startsWith("http://") || this.webhookUrl.startsWith("https://")) {
await fetch(this.webhookUrl, {method: "POST", body: JSON.stringify(body)});
} else if (this.webhookUrl.startsWith("redis://")) {
const parts = this.webhookUrl.split("/");
if (parts.length !== 5) {
throw new Error("redis webhook url must be in format: redis://<host>:<port>/<db>/<key>");
logger.fatal("redis webhook url must be in format: redis://<host>:<port>/<db>/<key>");
}
const redis = await initRedis(parts.slice(0, 4).join("/"));
await redis.rpush(parts[4], JSON.stringify(body));
@@ -1,5 +1,9 @@
import sbi from "puppeteer-cluster/dist/concurrency/SingleBrowserImplementation.js";

import { Logger } from "./logger.js";

const logger = new Logger();

const SingleBrowserImplementation = sbi.default;

@@ -40,7 +44,7 @@ export class ReuseWindowConcurrency extends SingleBrowserImplementation {
}

this.repairing = true;
console.debug("Starting repair");
logger.warn("Starting browser repair");

if (this.screencaster) {
this.screencaster.endAllTargets();

@@ -50,13 +54,13 @@ export class ReuseWindowConcurrency extends SingleBrowserImplementation {
// will probably fail, but just in case the repair was not necessary
await this.browser.close();
} catch (e) {
console.debug("Unable to close browser.");
logger.warn("Unable to close browser");
}

try {
await this.init();
} catch (err) {
console.debug("Unable to restart chrome.");
logger.warn("Unable to restart chrome");
}
this.repairRequested = false;
this.repairing = false;

@@ -71,7 +75,7 @@ export class ReuseWindowConcurrency extends SingleBrowserImplementation {
const res = await this.cdp.send("Target.createTarget", {url: this.startPage, newWindow: true});
targetId = res.targetId;
} catch (e) {
console.warn(e);
logger.warn("Error getting new page in window context", e);
await this.repair();
}