Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 14:33:17 +00:00)
Logging and Behavior Tweaks (#229)

- Ensure page is included in all logging details
- Update logging messages to be a single string, with variables added in the details
- Always wait for all pending wait requests to finish (unless counter < 0)
- Don't set puppeteer-cluster timeout (prep for removing puppeteer-cluster)
- Add behaviorTimeout to running behaviors in crawler, in addition to in behaviors themselves (see the sketch after this list)
- Add logging for behavior start, finish and timeout
- Move writeStats() logging to beginning of each page as well as at the end, to avoid confusion about pending pages
- For events from frames, use frameUrl along with current page
- deps: bump browsertrix-behaviors to 0.4.2
- version: bump to 0.8.1
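The behaviorTimeout change works by racing the per-frame behavior runs against a timer, as the crawler.js diff below shows. A minimal, standalone sketch of that pattern; sleep() and the runBehavior callback are stand-ins for the crawler's own helpers, not the real API:

const sleep = (secs) => new Promise((resolve) => setTimeout(resolve, secs * 1000));

async function runBehaviorsWithTimeout(frames, runBehavior, behaviorTimeoutSecs) {
  // sleep() resolves to undefined, while Promise.allSettled() resolves to an
  // array, so the shape of the race winner tells us whether the behaviors
  // finished before the timeout fired.
  const res = await Promise.race([
    sleep(behaviorTimeoutSecs),
    Promise.allSettled(frames.map((frame) => runBehavior(frame))),
  ]);

  if (res && res.length) {
    console.log(`Behaviors finished: ${res.length} frame(s)`);
  } else {
    console.error("Behaviors timed out");
  }
}

// demo: one "frame" finishes in time, the other outlives the 0.5s budget,
// so the run as a whole is reported as timed out
await runBehaviorsWithTimeout(
  [100, 2000],
  (delayMs) => new Promise((resolve) => setTimeout(resolve, delayMs)),
  0.5,
);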
This commit is contained in:
parent a4358f4622
commit 5da379cb5f

6 changed files with 116 additions and 82 deletions
crawler.js (121 changes)

@@ -166,7 +166,7 @@ export class Crawler {
       this.logger.info(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state");
 
-      this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.timeout * 2, os.hostname());
+      this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.behaviorTimeout + this.params.timeout, os.hostname());
 
     } else {
       this.logger.info("Storing state in memory", {}, "state");
@@ -324,21 +324,35 @@ export class Crawler {
     }
   }
 
-  _behaviorLog({data, type}) {
+  _behaviorLog({data, type}, pageUrl) {
     let behaviorLine;
+    let message;
+    let details;
+
+    if (typeof(data) === "string") {
+      message = data;
+      details = {};
+    } else {
+      message = type === "info" ? "Behavior log" : "Behavior debug";
+      details = typeof(data) === "object" ? data : {};
+    }
+
+    if (pageUrl) {
+      details.page = pageUrl;
+    }
 
     switch (type) {
       case "info":
         behaviorLine = JSON.stringify(data);
         if (behaviorLine != this._behaviorLastLine) {
-          this.logger.info("Behavior log", data, "behavior");
+          this.logger.info(message, details, "behaviorScript");
           this._behaviorLastLine = behaviorLine;
         }
         break;
 
       case "debug":
       default:
-        this.logger.debug("Behavior debug", data, "behavior");
+        this.logger.debug(message, details, "behaviorScript");
     }
   }
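The reworked _behaviorLog above implements the "single string message, variables in the details" convention from the commit message. A standalone sketch of the same logic, with console output standing in for the crawler's logger:

function behaviorLog({data, type}, pageUrl) {
  let message, details;

  if (typeof data === "string") {
    // string payloads become the log message itself
    message = data;
    details = {};
  } else {
    // object payloads keep a fixed message and become structured details
    message = type === "info" ? "Behavior log" : "Behavior debug";
    details = typeof data === "object" ? data : {};
  }

  if (pageUrl) {
    details.page = pageUrl; // the page is always attached when known
  }

  console.log(JSON.stringify({message, details, context: "behaviorScript"}));
}

behaviorLog({data: "autoscroll: done", type: "info"}, "https://example.com/");
behaviorLog({data: {state: "scrolling", y: 4000}, type: "debug"}, "https://example.com/");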
@@ -349,7 +363,12 @@ export class Crawler {
   }
 
   async crawlPage(opts) {
+    await this.writeStats();
+
     const {page, data} = opts;
+    const {url} = data;
+
+    const logDetails = {page: url};
 
     if (!this.isInScope(data)) {
       this.logger.info("Page no longer in scope", data);
@@ -358,7 +377,7 @@ export class Crawler {
 
     try {
       if (this.screencaster) {
-        await this.screencaster.screencastTarget(page.target(), data.url);
+        await this.screencaster.screencastTarget(page.target(), url);
       }
 
       if (this.emulateDevice) {
@@ -372,7 +391,7 @@ export class Crawler {
       await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});");
 
       if (this.params.behaviorOpts && !page.__bx_inited) {
-        await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata));
+        await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata, url));
         await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
         page.__bx_inited = true;
       }
@@ -384,10 +403,10 @@ export class Crawler {
 
       if (this.params.screenshot) {
         if (!page.isHTMLPage) {
-          this.logger.info("Skipping screenshots for non-HTML page");
+          this.logger.info("Skipping screenshots for non-HTML page", logDetails);
         }
         const archiveDir = path.join(this.collDir, "archive");
-        const screenshots = new Screenshots({page, url: data.url, directory: archiveDir});
+        const screenshots = new Screenshots({page, url, directory: archiveDir});
         if (this.params.screenshot.includes("view")) {
           await screenshots.take();
         }
@@ -410,45 +429,58 @@ export class Crawler {
 
       if (this.params.behaviorOpts) {
         if (!page.isHTMLPage) {
-          this.logger.info("Skipping behaviors for non-HTML page", data, "behavior");
+          this.logger.info("Skipping behaviors for non-HTML page", logDetails, "behavior");
         } else {
-          await Promise.allSettled(
-            page.frames().
-              filter(frame => this.shouldRunBehavior(frame)).
-              map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();"))
-          );
-
-          // also wait for general net idle
-          await this.netIdle(page);
+          const behaviorTimeout = this.params.behaviorTimeout / 1000;
+          this.logger.info("Behaviors started", {behaviorTimeout, ...logDetails}, "behavior");
+          const res = await Promise.race([
+            this.sleep(behaviorTimeout),
+            Promise.allSettled(
+              page.frames().
+                filter(frame => this.shouldRunBehavior(frame, logDetails)).
+                map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();", logDetails, "behavior"))
+            )
+          ]);
+          if (res && res.length) {
+            this.logger.info("Behaviors finished", {finished: res.length, ...logDetails}, "behavior");
+          } else {
+            this.logger.error("Behaviors timed out", logDetails, "behavior");
+          }
         }
       }
 
-      this.logger.info("Page graph data for successfully crawled page", data, "pageGraph");
-
       await this.writeStats();
+      this.logger.info("Page finished", logDetails, "pageStatus");
 
       await this.checkLimits();
 
       await this.serializeConfig();
 
     } catch (e) {
-      this.logger.error(`Error crawling page ${data.url}`, e);
+      this.logger.error("Page Errored", {...e, ...logDetails}, "pageStatus");
       await this.markPageFailed(page);
     }
   }
 
-  async shouldRunBehavior(frame) {
+  async shouldRunBehavior(frame, logDetails) {
     if (!frame.parentFrame()) {
       return true;
     }
 
-    const url = frame.url();
+    const frameUrl = frame.url();
 
-    if (url === "about:blank") {
-      return false;
-    }
+    let res;
 
-    return !!(await this.adBlockRules.shouldBlock(null, url));
+    if (frameUrl === "about:blank") {
+      res = false;
+    } else {
+      res = !(await this.adBlockRules.shouldBlock(null, frameUrl, logDetails));
+    }
+
+    if (!res) {
+      this.logger.info("Skipping behavior for frame", {frameUrl, ...logDetails}, "behavior");
+    }
+
+    return res;
   }
 
   async createWARCInfo(filename) {
@@ -591,7 +623,8 @@ export class Crawler {
       concurrency: this.params.newContext,
       maxConcurrency: this.params.workers,
       skipDuplicateUrls: false,
-      timeout: this.params.timeout * 2,
+      // effectively disable
+      timeout: 1e8,
       puppeteerOptions: this.puppeteerArgs,
       puppeteer,
       monitor: false
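Context for the timeout: 1e8 hunk above: puppeteer-cluster aborts any task that exceeds its per-task timeout, so pushing the value out to 1e8 ms (roughly 27.8 hours) leaves the option in place while making it effectively unreachable, presumably so only the crawler's own page and behavior timeouts ever fire. A sketch of such a launch, assuming puppeteer-cluster's Cluster.launch API; the option values here are illustrative, not the crawler's real configuration (which passes this.params values):

import { Cluster } from "puppeteer-cluster";

const cluster = await Cluster.launch({
  concurrency: Cluster.CONCURRENCY_PAGE,
  maxConcurrency: 4,
  skipDuplicateUrls: false,
  // 1e8 ms / 1000 / 3600 ≈ 27.8 hours: effectively disables the
  // cluster-level timeout, as prep for removing puppeteer-cluster entirely
  timeout: 1e8,
  monitor: false,
});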
@@ -645,6 +678,8 @@ export class Crawler {
       await this.pagesFH.close();
     }
 
+    await this.writeStats(true);
+
     // extra wait for all resources to land into WARCs
     await this.awaitPendingClear();
@@ -657,8 +692,6 @@ export class Crawler {
       await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
     }
 
-    await this.writeStats(true);
-
     if (this.params.generateWACZ && (!this.interrupted || this.finalExit || this.clearOnExit)) {
       await this.generateWACZ();
@@ -783,7 +816,7 @@ export class Crawler {
       "pendingPages": pendingList.map(x => JSON.stringify(x))
     };
 
-    this.logger.info("Crawl statistics", stats, "crawlState");
+    this.logger.info("Crawl statistics", stats, "crawlStatus");
 
     if (toFile && this.params.statsFilename) {
       try {
@@ -797,6 +830,8 @@ export class Crawler {
   async loadPage(page, urlData, selectorOptsList = DEFAULT_SELECTORS) {
     const {url, seedId, depth, extraHops = 0} = urlData;
 
+    const logDetails = {page: url};
+
     let isHTMLPage = true;
 
     if (!await this.isHTML(url)) {
@@ -840,13 +875,14 @@ export class Crawler {
     try {
       await page.goto(url, gotoOpts);
       if (this.errorCount > 0) {
-        this.logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`);
+        this.logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`, logDetails);
         this.errorCount = 0;
       }
     } catch (e) {
       let msg = e.message || "";
       if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
-        this.logger.error(`Load Error: ${url}: ${msg}`);
+        const mainMessage = e.name === "TimeoutError" ? "Page Load Timeout" : "Page Load Error";
+        this.logger.error(mainMessage, {msg, ...logDetails});
         this.errorCount++;
       }
     }
@@ -859,11 +895,9 @@ export class Crawler {
 
     const seed = this.params.scopedSeeds[seedId];
 
-    await this.checkCF(page);
+    await this.checkCF(page, logDetails);
 
-    await this.netIdle(page);
-
-    await this.sleep(5);
+    await this.netIdle(page, logDetails);
 
     // skip extraction if at max depth
     if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
@@ -884,7 +918,7 @@ export class Crawler {
     }
   }
 
-  async netIdle(page) {
+  async netIdle(page, details) {
     if (!this.params.netIdleWait) {
       return;
     }
@@ -895,7 +929,7 @@ export class Crawler {
     try {
       await page.waitForNetworkIdle({timeout: this.params.netIdleWait * 1000});
     } catch (e) {
-      this.logger.info("note: waitForNetworkIdle timed out, ignoring");
+      this.logger.info("waitForNetworkIdle timed out, ignoring", details);
       // ignore, continue
     }
   }
@@ -956,14 +990,14 @@ export class Crawler {
     }
   }
 
-  async checkCF(page) {
+  async checkCF(page, logDetails) {
     try {
       while (await page.$("div.cf-browser-verification.cf-im-under-attack")) {
-        this.logger.info("Cloudflare Check Detected, waiting for reload...");
+        this.logger.info("Cloudflare Check Detected, waiting for reload...", logDetails);
         await this.sleep(5.5);
       }
     } catch (e) {
-      this.logger.warn("Check CF failed, ignoring", e);
+      //this.logger.warn("Check CF failed, ignoring");
     }
   }
@@ -1090,10 +1124,7 @@ export class Crawler {
 
     const redis = await initRedis("redis://localhost/0");
 
-    // wait for pending requests upto 120 secs, unless canceling
-    const MAX_WAIT = 120;
-
-    for (let i = 0; i < MAX_WAIT && !this.interrupted; i++) {
+    while (!this.interrupted) {
       try {
         const count = Number(await redis.get(`pywb:${this.params.collection}:pending`) || 0);
         if (count <= 0) {
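The hunk above replaces the bounded 120-second wait with an open-ended loop, matching the commit note "always wait for all pending wait requests to finish". A standalone sketch of that loop, assuming an ioredis client and a sleep() helper like the crawler's; the 1-second poll interval is an assumption, not taken from the diff:

const sleep = (secs) => new Promise((resolve) => setTimeout(resolve, secs * 1000));

async function awaitPendingClear(redis, collection, isInterrupted) {
  // poll the pywb pending-request counter until it drains, or until the
  // crawl is interrupted
  while (!isInterrupted()) {
    const count = Number(await redis.get(`pywb:${collection}:pending`) || 0);
    if (count <= 0) {
      break; // all pending requests have landed in the WARCs
    }
    await sleep(1); // assumed poll interval
  }
}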
package.json

@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.8.0-beta.2",
+  "version": "0.8.1",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
@@ -13,7 +13,7 @@
   "dependencies": {
     "@novnc/novnc": "1.4.0-beta",
     "abort-controller": "^3.0.0",
-    "browsertrix-behaviors": "^0.4.1",
+    "browsertrix-behaviors": "^0.4.2",
     "get-folder-size": "^4.0.0",
     "ioredis": "^4.27.1",
     "js-yaml": "^4.1.0",
@@ -64,9 +64,9 @@ export class BlockRules
     }
 
     if (this.rules.length) {
-      this.logger.debug("URL Block Rules:\n");
+      this.logger.debug("URL Block Rules:\n", {}, "blocking");
       for (const rule of this.rules) {
-        this.logger.debug(rule.toString());
+        this.logger.debug(rule.toString(), {}, "blocking");
       }
     }
   }
@@ -85,21 +85,22 @@ export class BlockRules
     await page.setRequestInterception(true);
 
     page.on("request", async (request) => {
+      const logDetails = {page: page.url()};
       try {
-        await this.handleRequest(request);
+        await this.handleRequest(request, logDetails);
       } catch (e) {
-        this.logger.warn("Error handling request", e);
+        this.logger.warn("Error handling request", {...e, ...logDetails}, "blocking");
       }
     });
   }
 
-  async handleRequest(request) {
+  async handleRequest(request, logDetails) {
     const url = request.url();
 
     let blockState;
 
     try {
-      blockState = await this.shouldBlock(request, url);
+      blockState = await this.shouldBlock(request, url, logDetails);
 
       if (blockState === BlockState.ALLOW) {
         await request.continue();
@@ -108,11 +109,11 @@ export class BlockRules
       }
 
     } catch (e) {
-      this.logger.debug(`Block: (${blockState}) Failed On: ${url}`, e);
+      this.logger.debug(`Block: (${blockState}) Failed On: ${url}`, {...e, ...logDetails}, "blocking");
     }
   }
 
-  async shouldBlock(request, url) {
+  async shouldBlock(request, url, logDetails) {
     if (!url.startsWith("http:") && !url.startsWith("https:")) {
       return BlockState.ALLOW;
     }
@@ -151,14 +152,14 @@ export class BlockRules
     }
 
     for (const rule of this.rules) {
-      const {done, block} = await this.ruleCheck(rule, request, url, frameUrl, isNavReq);
+      const {done, block} = await this.ruleCheck(rule, request, url, frameUrl, isNavReq, logDetails);
 
       if (block) {
         if (blockState === BlockState.BLOCK_PAGE_NAV) {
-          this.logger.warn(`Warning: Block rule match for page request "${url}" ignored, set --exclude to block full pages`);
+          this.logger.warn("Block rule match for page request ignored, set --exclude to block full pages", {url, ...logDetails}, "blocking");
           return BlockState.ALLOW;
         }
-        this.logger.debug(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`);
+        this.logger.debug("URL Blocked in iframe", {url, frameUrl, ...logDetails}, "blocking");
         await this.recordBlockMsg(url);
         return blockState;
       }
@@ -170,7 +171,7 @@ export class BlockRules
     return BlockState.ALLOW;
   }
 
-  async ruleCheck(rule, request, reqUrl, frameUrl, isNavReq) {
+  async ruleCheck(rule, request, reqUrl, frameUrl, isNavReq, logDetails) {
     const {url, inFrameUrl, frameTextMatch} = rule;
 
     const type = rule.type || "block";
@@ -190,8 +191,8 @@ export class BlockRules
       return {block: false, done: false};
     }
 
-    const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly;
-    this.logger.debug(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`);
+    const block = await this.isTextMatch(request, reqUrl, frameTextMatch, logDetails) ? !allowOnly : allowOnly;
+    this.logger.debug("URL Conditional rule in iframe", {...logDetails, url, rule: block ? "BLOCKED" : "ALLOWED", frameUrl}, "blocking");
     return {block, done: true};
   }
@@ -200,7 +201,7 @@ export class BlockRules
     return {block, done: false};
   }
 
-  async isTextMatch(request, reqUrl, frameTextMatch) {
+  async isTextMatch(request, reqUrl, frameTextMatch, logDetails) {
     try {
       const res = await fetch(reqUrl);
       const text = await res.text();
@@ -208,7 +209,7 @@ export class BlockRules
       return !!text.match(frameTextMatch);
 
     } catch (e) {
-      this.logger.debug("Error determining text match", e);
+      this.logger.debug("Error determining text match", {...e, ...logDetails}, "blocking");
     }
   }
@@ -249,19 +250,21 @@ export class AdBlockRules extends BlockRules
     await page.setRequestInterception(true);
 
     page.on("request", async (request) => {
+      const logDetails = {page: page.url()};
+
       try {
-        await this.handleRequest(request);
+        await this.handleRequest(request, logDetails);
       } catch (e) {
-        this.logger.warn("Error handling request", e);
+        this.logger.warn("Error handling request", {...e, ...logDetails}, "blocking");
       }
     });
   }
 
-  async shouldBlock(request, url) {
+  async shouldBlock(request, url, logDetails) {
     const fragments = url.split("/");
     const domain = fragments.length > 2 ? fragments[2] : null;
     if (this.adhosts.includes(domain)) {
-      this.logger.debug(`URL blocked for being an ad: ${url}`);
+      this.logger.debug("URL blocked for being an ad", {url, ...logDetails}, "blocking");
       await this.recordBlockMsg(url);
       return BlockState.BLOCK_AD;
     }
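All of the BlockRules/AdBlockRules changes above follow one pattern: capture {page: page.url()} once per intercepted request and thread it through as logDetails, so every "blocking" log line can say which page it belongs to. A condensed sketch of that interception pattern; shouldBlock here is a placeholder predicate, not the class method:

async function interceptWithPageContext(page, shouldBlock) {
  await page.setRequestInterception(true);

  page.on("request", async (request) => {
    // capture the page URL at event time, so log context stays correct
    // even as the page navigates
    const logDetails = {page: page.url()};
    try {
      if (await shouldBlock(request, request.url(), logDetails)) {
        await request.abort();
      } else {
        await request.continue();
      }
    } catch (e) {
      console.warn("Error handling request", {...e, ...logDetails});
    }
  });
}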
@@ -143,11 +143,12 @@ export function chromeArgs (proxy, userAgent=null, extraArgs=[]) {
 }
 
 
-export async function evaluateWithCLI(frame, funcString, name = "behaviors") {
+export async function evaluateWithCLI(frame, funcString, logData, contextName) {
   const context = await frame.executionContext();
-  const url = frame.url();
 
-  logger.info(`Running ${name}...`, {url});
+  let details = {frameUrl: frame.url(), ...logData};
+
+  logger.info("Run Script Started", details, contextName);
 
   // from puppeteer _evaluateInternal() but with includeCommandLineAPI: true
   const contextId = context._contextId;
@@ -164,13 +165,12 @@ export async function evaluateWithCLI(frame, funcString, name = "behaviors") {
   });
 
   if (exceptionDetails) {
-    const details = exceptionDetails.stackTrace || {};
-    details.url = url;
-    logger.error(
-      `Run ${name} failed: ${exceptionDetails.text}`, details
-    );
+    if (exceptionDetails.stackTrace) {
+      details = {...exceptionDetails.stackTrace, text: exceptionDetails.text, ...details};
+    }
+    logger.error("Run Script Failed", details, contextName);
   } else {
-    logger.info(`Run ${name} finished`, {url});
+    logger.info("Run Script Finished", details, contextName);
   }
 
   return result.value;
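As the context comment in the hunk above notes, evaluateWithCLI reimplements puppeteer's internal _evaluateInternal() so it can pass includeCommandLineAPI: true to the DevTools Runtime.evaluate call, which plain frame.evaluate() does not expose. A simplified sketch of the same idea over a plain CDP session; this evaluates in the main frame's default context only, whereas the real helper resolves the per-frame execution context id:

async function evaluateWithCommandLineAPI(page, expression) {
  const client = await page.target().createCDPSession();

  const {result, exceptionDetails} = await client.send("Runtime.evaluate", {
    expression,
    includeCommandLineAPI: true, // exposes $, $$, etc. to the expression
    awaitPromise: true,          // wait out a returned promise
    returnByValue: true,         // serialize the resolved value
  });

  if (exceptionDetails) {
    throw new Error(`Run Script Failed: ${exceptionDetails.text}`);
  }

  return result.value;
}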
@@ -326,11 +326,11 @@ return 0;
       data = JSON.parse(json);
     } catch(e) {
       logger.error("Invalid queued json", json);
-      return null;
+      return undefined;
     }
 
     if (!data) {
-      return null;
+      return undefined;
     }
 
     const url = data.url;
yarn.lock

@@ -1148,10 +1148,10 @@ browserslist@^4.21.3:
     node-releases "^2.0.6"
     update-browserslist-db "^1.0.9"
 
-browsertrix-behaviors@^0.4.1:
-  version "0.4.1"
-  resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.4.1.tgz#dfa5beb66348df28ccbaad4ba66fe49ca0dde947"
-  integrity sha512-zG6eNWqT9z+WvdUSSWg6de8NtYnGs7MHpzezKUgMJg+ze1+xtBgc6jkLT1d64eXC0cZQgwW+WV/e0yLAi1PyPA==
+browsertrix-behaviors@^0.4.2:
+  version "0.4.2"
+  resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.4.2.tgz#830f4e37ddebf10dd923237dfd75c734d5d211e9"
+  integrity sha512-5w6kPL3NB/BkmEGxWt3NT3iddAaSzMR1TtDPS7b66fM9kkhpCjCv/R/zR951jWDIeV3flJFBOy09uI5o8asPqg==
 
 bser@2.1.1:
   version "2.1.1"