Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00
improve logging / healthcheck responsiveness:
- avoid double logging of exceptions
- ensure page is included in all errors
- treat some timeouts as warnings
- remove unused responseHeaders call in browser
- fix screenshot err logging

healthcheck:
- interrupt crawl directly after healthcheck fails
parent 1c486ea1f3
commit 416a62fb7a

7 changed files with 140 additions and 97 deletions

crawler.js (168 changes)
@@ -546,6 +546,10 @@ self.__bx_behaviors.selectMainBehavior();
       if (this.healthChecker) {
         this.healthChecker.incError();
       }
+      this.interrupted = this.healthChecker.isFailing();
+      if (this.interrupted) {
+        logger.warn("Interrupting due to too many failures, will restart");
+      }
     }

     await this.serializeConfig();

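A minimal sketch of the interrupt pattern this hunk introduces, with a stubbed loop and stand-in names (only incError/isFailing come from the diff): each failed page bumps the error count, and once the threshold is crossed the crawl loop stops taking new work so the crawler can restart, instead of waiting for an external probe to kill it.

    // Sketch only: threshold-based healthcheck driving a crawl-loop interrupt.
    class ThresholdChecker {
      constructor(errorThreshold = 5) {
        this.errorCount = 0;
        this.errorThreshold = errorThreshold;
      }
      incError() { this.errorCount++; }
      isFailing() { return this.errorCount >= this.errorThreshold; }
    }

    async function crawlLoop(urls, crawlPage, checker) {
      let interrupted = false;
      while (!interrupted && urls.length) {
        try {
          await crawlPage(urls.shift());
        } catch (e) {
          checker.incError();
          // interrupt directly after the healthcheck starts failing
          interrupted = checker.isFailing();
        }
      }
      return interrupted;
    }
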
@@ -992,7 +996,7 @@ self.__bx_behaviors.selectMainBehavior();
         logger.debug(stdout.join("\n"));
       }
       if (stderr.length && this.params.logging.includes("debug")) {
-        logger.error(stderr.join("\n"));
+        logger.debug(stderr.join("\n"));
       }
       resolve(code);
     });

@@ -1046,20 +1050,24 @@ self.__bx_behaviors.selectMainBehavior();
     const failCrawlOnError = ((depth === 0) && this.params.failOnFailedSeed);

-    let isHTMLPage = await timedRun(
+    data.isHTMLPage = await timedRun(
       this.isHTML(url),
       FETCH_TIMEOUT_SECS,
       "HEAD request to determine if URL is HTML page timed out",
-      logDetails
+      logDetails,
+      "fetch",
+      true
     );

-    if (!isHTMLPage) {
+    if (!data.isHTMLPage) {
       try {
         const captureResult = await timedRun(
           this.directFetchCapture(url),
           FETCH_TIMEOUT_SECS,
           "Direct fetch capture attempt timed out",
-          logDetails
+          logDetails,
+          "fetch",
+          true
         );
         if (captureResult) {
           logger.info("Direct fetch successful", {url, ...logDetails}, "fetch");

@@ -1070,73 +1078,11 @@ self.__bx_behaviors.selectMainBehavior();
       }
     }

-    let ignoreAbort = false;
-
-    // Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
-    // if so, don't report as an error
-    page.once("requestfailed", (req) => {
-      ignoreAbort = shouldIgnoreAbort(req);
-    });
-
-    if (isHTMLPage) {
-      page.once("domcontentloaded", () => {
-        data.loadState = LoadState.CONTENT_LOADED;
-      });
-    }
-
-    const gotoOpts = isHTMLPage ? this.gotoOpts : {waitUntil: "domcontentloaded"};
-
-    logger.info("Awaiting page load", logDetails);
-
-    try {
-      const resp = await page.goto(url, gotoOpts);
-
-      // Handle 4xx or 5xx response as a page load error
-      const statusCode = resp.status();
-      if (statusCode.toString().startsWith("4") || statusCode.toString().startsWith("5")) {
-        if (failCrawlOnError) {
-          logger.fatal("Seed Page Load Error, failing crawl", {statusCode, ...logDetails});
-        } else {
-          logger.error("Page Load Error, skipping page", {statusCode, ...logDetails});
-          throw new Error(`Page ${url} returned status code ${statusCode}`);
-        }
-      }
-
-      const contentType = await this.browser.responseHeader(resp, "content-type");
-
-      isHTMLPage = this.isHTMLContentType(contentType);
-
-    } catch (e) {
-      let msg = e.message || "";
-      if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
-        if (e.name === "TimeoutError") {
-          if (data.loadState !== LoadState.CONTENT_LOADED) {
-            if (failCrawlOnError) {
-              logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
-            } else {
-              logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
-              throw e;
-            }
-          } else {
-            logger.warn("Page Loading Slowly, skipping behaviors", {msg, ...logDetails});
-            data.skipBehaviors = true;
-          }
-        } else {
-          if (failCrawlOnError) {
-            logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
-          } else {
-            logger.error("Page Load Error, skipping page", {msg, ...logDetails});
-            throw e;
-          }
-        }
-      }
-    }
+    await this.doGotoPage(page, url, data, logDetails, failCrawlOnError);

     data.loadState = LoadState.FULL_PAGE_LOADED;

-    data.isHTMLPage = isHTMLPage;
-
-    if (isHTMLPage) {
+    if (data.isHTMLPage) {
       let frames = await page.frames();
       frames = await Promise.allSettled(frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)));

@@ -1147,7 +1093,7 @@ self.__bx_behaviors.selectMainBehavior();
       data.filteredFrames = [];
     }

-    if (!isHTMLPage) {
+    if (!data.isHTMLPage) {
       logger.debug("Skipping link extraction for non-HTML page", logDetails);
       return;
     }

@@ -1172,6 +1118,82 @@ self.__bx_behaviors.selectMainBehavior();
     }
   }

+  async doGotoPage(page, url, data, logDetails, failCrawlOnError) {
+    let ignoreAbort = false;
+
+    // Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
+    // if so, don't report as an error
+    page.once("requestfailed", (req) => {
+      ignoreAbort = shouldIgnoreAbort(req);
+    });
+
+    if (data.isHTMLPage) {
+      page.once("domcontentloaded", () => {
+        data.loadState = LoadState.CONTENT_LOADED;
+      });
+    }
+
+    const gotoOpts = data.isHTMLPage ? this.gotoOpts : {waitUntil: "domcontentloaded"};
+
+    logger.info("Awaiting page load", logDetails);
+
+    let mainPageResp = null;
+
+    try {
+      mainPageResp = await page.goto(url, gotoOpts);
+
+    } catch (e) {
+      let msg = e.message || "";
+      if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
+        if (e.name === "TimeoutError") {
+          if (data.loadState !== LoadState.CONTENT_LOADED) {
+            if (failCrawlOnError) {
+              logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
+            } else {
+              logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
+              e.message = "logged";
+              throw e;
+            }
+          } else {
+            logger.warn("Page Loading Slowly, skipping behaviors", {msg, ...logDetails});
+            data.skipBehaviors = true;
+          }
+        } else {
+          if (failCrawlOnError) {
+            logger.fatal("Seed Page Load Error, failing crawl", {msg, ...logDetails});
+          } else {
+            logger.error("Page Load Error, skipping page", {msg, ...logDetails});
+            e.message = "logged";
+            throw e;
+          }
+        }
+      }
+    }
+
+    if (!mainPageResp) {
+      return;
+    }
+
+    // Handle 4xx or 5xx response as a page load error
+    const statusCode = mainPageResp.status();
+
+    if (statusCode.toString().startsWith("4") || statusCode.toString().startsWith("5")) {
+      if (failCrawlOnError) {
+        logger.fatal("Seed Page Load Error, failing crawl", {statusCode, ...logDetails});
+      } else {
+        logger.error("Invalid status Code, skipping page", {statusCode, ...logDetails});
+        throw new Error("logged");
+      }
+    }
+
+    // attempt to determine content-type more accurately, ignore if failed
+    try {
+      data.isHTMLPage = this.isHTMLContentType(mainPageResp.headers()["content-type"]);
+    } catch (e) {
+      // ignore
+    }
+  }
+
   async netIdle(page, details) {
     if (!this.params.netIdleWait) {
       return;

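The `e.message = "logged"` assignments in the new doGotoPage are a sentinel: the error is logged once where it happens, marked, and rethrown, and the worker's catch handler (see the PageWorker hunk at the end of this diff) skips anything already marked. A self-contained sketch of the round trip, with hypothetical names:

    // Sketch of the "logged" sentinel: report once at the failure site, mark,
    // rethrow; the outer handler only logs unmarked exceptions.
    async function gotoPage(url) {
      throw new Error(`net::ERR_FAILED loading ${url}`); // stand-in for page.goto()
    }

    async function loadPage(url) {
      try {
        await gotoPage(url);
      } catch (e) {
        console.error("Page Load Error, skipping page", { url, msg: e.message });
        e.message = "logged"; // sentinel: already reported
        throw e;
      }
    }

    async function workerIteration(url) {
      try {
        await loadPage(url);
      } catch (e) {
        if (e.message !== "logged") {
          console.error("Worker Exception", { msg: e.message });
        }
      }
    }

    workerIteration("https://example.com/"); // logs the page error exactly once
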
@@ -1267,7 +1289,11 @@ self.__bx_behaviors.selectMainBehavior();

     while (await timedRun(
       page.$("div.cf-browser-verification.cf-im-under-attack"),
-      PAGE_OP_TIMEOUT_SECS
+      PAGE_OP_TIMEOUT_SECS,
+      "Cloudflare check timed out",
+      logDetails,
+      "general",
+      true
     )) {
       logger.debug("Cloudflare Check Detected, waiting for reload...", logDetails);
       await sleep(5.5);

@@ -280,10 +280,6 @@ export class Browser extends BaseBrowser
     return {page, cdp};
   }

-  async responseHeader(resp, header) {
-    return await resp.headers()[header];
-  }
-
   async evaluateWithCLI(_, frame, cdp, funcString, logData, contextName) {
     const context = await frame.executionContext();
     cdp = context._client;

@@ -10,17 +10,22 @@ export class HealthChecker
     this.port = port;
     this.errorCount = 0;
     this.errorThreshold = errorThreshold;
+    this.resetCount = 0;

     this.healthServer = http.createServer((...args) => this.healthCheck(...args));
     logger.info(`Healthcheck server started on ${port}`, {}, "healthcheck");
     this.healthServer.listen(port);
   }

+  isFailing() {
+    return this.errorCount >= this.errorThreshold;
+  }
+
   async healthCheck(req, res) {
     const pathname = url.parse(req.url).pathname;
     switch (pathname) {
     case "/healthz":
-      if (this.errorCount < this.errorThreshold) {
+      if (!this.isFailing()) {
         logger.debug(`health check ok, num errors ${this.errorCount} < ${this.errorThreshold}`, {}, "healthcheck");
         res.writeHead(200);
         res.end();

@@ -34,9 +39,13 @@ export class HealthChecker
   }

   resetErrors() {
-    if (this.errorCount > 0) {
+    this.resetCount++;
+    if (this.errorCount > 0 && this.resetCount >= 2) {
       logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`, {}, "healthcheck");
       this.errorCount = 0;
+      this.resetCount = 0;
+    } else if (!this.errorCount) {
+      this.resetCount = 0;
     }
   }

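The resetCount change debounces recovery: resetErrors presumably runs after each successful page load, so the error count only clears once two consecutive pages have loaded, and a single lucky load cannot mask a failing crawler. A standalone sketch of the same logic on a plain object:

    // Sketch: error count clears only after two consecutive successful loads.
    const hc = { errorCount: 3, resetCount: 0 };

    function resetErrors(hc) {
      hc.resetCount++;
      if (hc.errorCount > 0 && hc.resetCount >= 2) {
        hc.errorCount = 0;
        hc.resetCount = 0;
      } else if (!hc.errorCount) {
        hc.resetCount = 0;
      }
    }

    resetErrors(hc); // first success: errorCount still 3
    resetErrors(hc); // second consecutive success: errorCount -> 0
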
@@ -3,7 +3,7 @@ import path from "path";
 import * as warcio from "warcio";
 import sharp from "sharp";

-import { logger } from "./logger.js";
+import { logger, errJSON } from "./logger.js";

 // ============================================================================

@@ -47,7 +47,7 @@ export class Screenshots {
       await this.writeBufferToWARC(screenshotBuffer, screenshotType, options.type);
       logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
     } catch (e) {
-      logger.error(`Taking screenshot (type: ${screenshotType}) failed for ${this.url}`, e.message);
+      logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
     }
   }

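errJSON comes from logger.js, which is outside this diff; judging by its use here it flattens an Error into plain fields that can be spread into the structured log record. A hypothetical sketch of such a helper:

    // Hypothetical errJSON-style helper (the real one lives in logger.js):
    // flatten an Error into JSON-serializable fields for structured logging.
    function errJSON(e) {
      return { type: "exception", message: e.message, stack: e.stack };
    }

    // mirrors the new call shape:
    // logger.error("Taking screenshot failed", {page: url, type, ...errJSON(e)}, "screenshots");
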
@@ -56,8 +56,8 @@ export class Screenshots {
   }

   async takeThumbnail() {
-    try {
-      const screenshotType = "thumbnail";
+    const screenshotType = "thumbnail";
+    try {
       await this.browser.setViewport(this.page, {width: 1920, height: 1080});
       const options = screenshotTypes[screenshotType];
       const screenshotBuffer = await this.page.screenshot(options);

@@ -68,7 +68,7 @@ export class Screenshots {
       await this.writeBufferToWARC(thumbnailBuffer, screenshotType, options.type);
       logger.info(`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`);
     } catch (e) {
-      logger.error(`Taking screenshot (type: thumbnail) failed for ${this.url}`, e.message);
+      logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
     }
   }

@@ -134,7 +134,7 @@ end
   });

   redis.defineCommand("requeue", {
-    numberOfKeys: 3,
+    numberOfKeys: 4,
     lua: `
 local res = redis.call('get', KEYS[3]);
 if not res then

@@ -148,6 +148,9 @@ if not res then
     redis.call('zadd', KEYS[2], 0, json);
     return 1;
   else
+    data['failed'] = '1';
+    json = cjson.encode(data);
+    redis.call('lpush', KEYS[4], json);
     return 2;
   end
 end

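With ioredis, defineCommand exposes the script as this.redis.requeue(...), binding the first numberOfKeys arguments to KEYS and the rest to ARGV; raising numberOfKeys to 4 makes room for the failure-list key that the new else branch pushes to. A sketch of the registration and call shape, with the key layout assumed from the requeueOrFail call below:

    // Sketch: registering and invoking the script via ioredis (ESM, top-level await).
    import Redis from "ioredis";

    const redis = new Redis();

    redis.defineCommand("requeue", {
      numberOfKeys: 4,
      lua: "return 0;", // stand-in for the full script above
    });

    // Assumed layout: KEYS[1]=pending hash, KEYS[2]=queue zset,
    // KEYS[3]=per-page pending key, KEYS[4]=failure list;
    // ARGV[1]=url, ARGV[2]=max retries. Returns 1=requeued, 2=failed, 0=no-op.
    const res = await redis.requeue("pkey", "qkey", "pkey:https://example.com/", "fkey",
      "https://example.com/", 5);
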
@@ -179,7 +182,8 @@ return 0;
   }

   async markFailed(url) {
-    await this.redis.movefailed(this.pkey, this.fkey, url, "1", "failed");
+    //await this.redis.movefailed(this.pkey, this.fkey, url, "1", "failed");
+    await this.requeueOrFail(url);

     return await this.redis.incr(this.dkey);
   }

@@ -416,7 +420,12 @@ return 0;
     const pendingUrls = await this.redis.hkeys(this.pkey);

     for (const url of pendingUrls) {
-      const res = await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, url, this.maxRetryPending);
+      await this.requeueOrFail(url);
     }
   }

+  async requeueOrFail(url) {
+    const res = await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, this.fkey, url, this.maxRetryPending);
+    switch (res) {
+    case 1:
+      logger.info(`Requeued: ${url}`);

@@ -426,7 +435,7 @@ return 0;
       logger.info(`Not requeuing anymore: ${url}`);
       break;
     }
-  }
+    return res === 1;
   }

   async queueSize() {

@@ -4,7 +4,7 @@ export function sleep(seconds) {
   return new Promise(resolve => setTimeout(resolve, seconds * 1000));
 }

-export function timedRun(promise, seconds, message="Promise timed out", logDetails={}, context="general") {
+export function timedRun(promise, seconds, message="Promise timed out", logDetails={}, context="general", isWarn=false) {
   // return Promise return value or log error if timeout is reached first
   const timeout = seconds * 1000;

@@ -17,7 +17,8 @@ export function timedRun(promise, seconds, message="Promise timed out", logDetai
   return Promise.race([promise, rejectPromiseOnTimeout(timeout)])
     .catch((err) => {
       if (err == "timeout reached") {
-        logger.error(message, {"seconds": seconds, ...logDetails}, context);
+        const logFunc = isWarn ? logger.warn : logger.error;
+        logFunc.call(logger, message, {"seconds": seconds, ...logDetails}, context);
       } else {
         //logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
         throw err;

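The new isWarn flag lets each call site decide whether hitting the timeout is a failure or merely noteworthy; the HEAD-check, direct-fetch, and Cloudflare-check calls above all pass true. A usage sketch:

    // Usage sketch (ESM, top-level await; assumes timedRun and logger are
    // imported from the module patched above).
    const slow = new Promise((resolve) => setTimeout(() => resolve("ok"), 10000));

    // Logs the message via logger.warn instead of logger.error after 5s, and
    // resolves to undefined (the catch swallows the "timeout reached" rejection).
    const result = await timedRun(slow, 5, "HEAD request timed out", {}, "fetch", true);
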
@@ -167,14 +167,16 @@ export class PageWorker
           this.crawler.crawlPage(opts),
           this.maxPageTime,
           "Page Worker Timeout",
-          {workerid},
+          this.logDetails,
           "worker"
         ),
         this.crashBreak
       ]);

     } catch (e) {
-      logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
+      if (e.message !== "logged") {
+        logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
+      }
     } finally {
       await this.crawler.pageFinished(data);
     }