Catch loading issues (#255)

* various loading improvements to avoid pages getting 'stuck', plus per-page load-state tracking (see the sketch after this list)
- add PageState object: store loadState (0 to 4) as well as other per-page state properties on a single defined object
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set loadState to 2 (full-page-load)
- if page.goto() times out: if 'domcontentloaded' was never reached either, fail immediately; if 'domcontentloaded' was reached, extract links, but don't run behaviors
- a page is considered 'finished' if it reached at least loadState 2 (full-page-load), even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if a frame does not actually come from a frame tag (e.g. from an OBJECT tag), and skip it as well
- screencaster: cast every frame for now, instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- worker ids: just use numeric indexes 0, 1, ..., n-1, and send the numeric index as part of screencast messages
- worker: only keep track of crash state (used to decide when to recreate the page), decoupling crash state from page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: add pageFinished() callback, called when a page finishes
- worker: add workerIdle() callback; call screencaster.stopById() and send a 'close' message when the worker is empty
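
A condensed sketch of the load-state lifecycle these changes implement (constants copied from the util/state.js diff below; the success check paraphrases pageFinished() in crawler.js — pageSucceeded() is a hypothetical helper, the real code inlines the comparison):

    // load states, as defined in util/state.js
    const LoadState = {
      FAILED: 0,            // default until the page proves otherwise
      CONTENT_LOADED: 1,    // 'domcontentloaded' fired
      FULL_PAGE_LOADED: 2,  // page.goto() resolved
      EXTRACTION_DONE: 3,   // text extraction finished
      BEHAVIORS_DONE: 4,    // behaviors ran to completion
    };

    // paraphrase of the check in pageFinished(): a page counts as finished
    // once it is at least fully loaded, even if behaviors timed out later
    function pageSucceeded(data) {
      return data.loadState >= LoadState.FULL_PAGE_LOADED;
    }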
Ilya Kreymer 2023-03-20 18:31:37 -07:00 committed by GitHub
parent 07e503a8e6
commit 02fb137b2c
8 changed files with 190 additions and 105 deletions

crawler.js

@@ -4,8 +4,7 @@ import fs from "fs";
import os from "os";
import fsp from "fs/promises";
import { RedisCrawlState } from "./util/state.js";
//import AbortController from "abort-controller";
import { RedisCrawlState, LoadState } from "./util/state.js";
import Sitemapper from "sitemapper";
import { v4 as uuidv4 } from "uuid";
import yaml from "js-yaml";
@@ -356,13 +355,6 @@ export class Crawler {
}
}
async teardownPage({workerid}) {
if (this.screencaster) {
logger.debug("End Screencast", {workerid}, "screencast");
await this.screencaster.stopById(workerid);
}
}
async crawlPage(opts) {
await this.writeStats();
@@ -371,6 +363,7 @@
const {url} = data;
const logDetails = {page: url, workerid};
data.logDetails = logDetails;
data.workerid = workerid;
if (!this.isInScope(data, logDetails)) {
@@ -381,7 +374,7 @@
// run custom driver here
await this.driver({page, data, crawler: this});
const title = await page.title();
data.title = await page.title();
if (this.params.screenshot) {
if (!data.isHTMLPage) {
@@ -400,17 +393,18 @@
}
}
let text = "";
if (this.params.text && data.isHTMLPage) {
const result = await cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
text = await new TextExtract(result).parseTextFromDom();
data.text = await new TextExtract(result).parseTextFromDom();
}
await this.writePage(data, title, this.params.text ? text : null);
data.loadState = LoadState.EXTRACTION_DONE;
if (this.params.behaviorOpts) {
if (!data.isHTMLPage) {
logger.debug("Skipping behaviors for non-HTML page", logDetails, "behavior");
} else if (data.skipBehaviors) {
logger.info("Skipping behaviors for slow page", logDetails, "behavior");
} else {
const behaviorTimeout = this.params.behaviorTimeout / 1000;
@@ -424,23 +418,55 @@
if (res && res.length) {
logger.info("Behaviors finished", {finished: res.length, ...logDetails}, "behavior");
data.loadState = LoadState.BEHAVIORS_DONE;
}
}
}
logger.info("Page finished", logDetails, "pageStatus");
return true;
}
await this.crawlState.markFinished(url);
async pageFinished(data) {
await this.writePage(data);
await this.checkLimits();
// if page loaded, considered page finished successfully
// (even if behaviors timed out)
const { loadState, logDetails } = data;
if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
logger.info("Page Finished", {loadState, ...logDetails}, "pageStatus");
await this.crawlState.markFinished(data.url);
if (this.healthChecker) {
this.healthChecker.resetErrors();
}
} else {
logger.warn("Page Load Failed", {loadState, ...logDetails}, "pageStatus");
await this.crawlState.markFailed(data.url);
if (this.healthChecker) {
this.healthChecker.incError();
}
}
await this.serializeConfig();
if (this.healthChecker) {
this.healthChecker.resetErrors();
}
await this.checkLimits();
}
return true;
async teardownPage({workerid}) {
if (this.screencaster) {
await this.screencaster.stopById(workerid);
}
}
async workerIdle(workerid) {
if (this.screencaster) {
//logger.debug("End Screencast", {workerid}, "screencast");
await this.screencaster.stopById(workerid, true);
}
}
async runBehaviors(page, frames, logDetails) {
@@ -461,13 +487,22 @@
}
}
shouldIncludeFrame(frame, logDetails) {
async shouldIncludeFrame(frame, logDetails) {
if (!frame.parentFrame()) {
return true;
return frame;
}
const frameUrl = frame.url();
const frameElem = await frame.frameElement();
const tagName = await frame.evaluate(e => e.tagName, frameElem);
if (tagName !== "IFRAME" && tagName !== "FRAME") {
logger.debug("Skipping processing non-frame object", {tagName, frameUrl, ...logDetails}, "behavior");
return null;
}
let res;
if (frameUrl === "about:blank") {
@@ -480,7 +515,7 @@
logger.debug("Skipping processing frame", {frameUrl, ...logDetails}, "behavior");
}
return res;
return res ? frame : null;
}
async getInfoString() {
@@ -833,9 +868,9 @@ export class Crawler {
}
async loadPage(page, data, selectorOptsList = DEFAULT_SELECTORS) {
const {url, seedId, depth, workerid, extraHops = 0} = data;
const {url, seedId, depth, extraHops = 0} = data;
const logDetails = {page: url, workerid};
const logDetails = data.logDetails;
let isHTMLPage = await timedRun(
this.isHTML(url),
@@ -877,6 +912,12 @@
ignoreAbort = shouldIgnoreAbort(req);
});
if (isHTMLPage) {
page.once("domcontentloaded", () => {
data.loadState = LoadState.CONTENT_LOADED;
});
}
const gotoOpts = isHTMLPage ? this.gotoOpts : {waitUntil: "domcontentloaded"};
logger.info("Awaiting page load", logDetails);
@@ -888,24 +929,35 @@
isHTMLPage = this.isHTMLContentType(contentType);
//if (this.healthChecker) {
// this.healthChecker.resetErrors();
//}
} catch (e) {
let msg = e.message || "";
if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
const mainMessage = e.name === "TimeoutError" ? "Page Load Timeout" : "Page Load Error";
logger.error(mainMessage, {msg, ...logDetails});
//if (this.healthChecker) {
// this.healthChecker.incError();
//}
if (e.name === "TimeoutError") {
if (data.loadState !== LoadState.CONTENT_LOADED) {
logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
throw e;
} else {
logger.warn("Page Loading Slowly, skipping behaviors", {msg, ...logDetails});
data.skipBehaviors = true;
}
} else {
logger.error("Page Load Error, skipping page", {msg, ...logDetails});
throw e;
}
}
}
data.loadState = LoadState.FULL_PAGE_LOADED;
data.isHTMLPage = isHTMLPage;
if (isHTMLPage) {
data.filteredFrames = await page.frames().filter(frame => this.shouldIncludeFrame(frame, logDetails));
let frames = await page.frames();
frames = await Promise.allSettled(frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)));
data.filteredFrames = frames.filter((x) => x.status === "fulfilled" && x.value).map(x => x.value);
//data.filteredFrames = await page.frames().filter(frame => this.shouldIncludeFrame(frame, logDetails));
} else {
data.filteredFrames = [];
}
@@ -934,13 +986,6 @@
}
}
async markPageFailed(url) {
if (this.healthChecker) {
this.healthChecker.incError();
}
await this.crawlState.markFailed(url);
}
async netIdle(page, details) {
if (!this.params.netIdleWait) {
return;
@@ -1095,9 +1140,9 @@
}
}
async writePage({url, depth}, title, text) {
async writePage({url, depth, title, text, loadState}) {
const id = uuidv4();
const row = {"id": id, "url": url, "title": title};
const row = {id, url, title, loadState};
if (depth === 0) {
row.seed = true;
@@ -1135,6 +1180,7 @@
} catch(e) {
// can't confirm not html, so try in browser
logger.debug("HEAD request failed", {...e, url});
return true;
}
}

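Since writePage() now reads title, text, and loadState directly off the page state object, each pages.jsonl row records how far its page got. An illustrative row (all values hypothetical; text is present only when text extraction is enabled):

    {"id": "f6c2a1e0-hypothetical", "url": "https://example.com/", "title": "Example Domain", "loadState": 4, "seed": true, "text": "Example Domain ..."}
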
package.json

@@ -12,7 +12,7 @@
},
"dependencies": {
"@novnc/novnc": "^1.4.0",
"browsertrix-behaviors": "^0.4.2",
"browsertrix-behaviors": "^0.5.0-beta.0",
"get-folder-size": "^4.0.0",
"ioredis": "^4.27.1",
"js-yaml": "^4.1.0",

util/browser.js

@@ -69,6 +69,10 @@ export class Browser
return {page: this.firstPage, cdp: this.firstCDP};
}
numPages() {
return this.context ? this.context.pages().length : 0;
}
async newWindowPageWithCDP() {
// unique url to detect new pages
const startPage = "about:blank?_browsertrix" + Math.random().toString(36).slice(2);

util/screencaster.js

@@ -175,19 +175,15 @@ class ScreenCaster
}
async screencastPage(page, cdp, id) {
//const id = uuidv4();
this.urls.set(id, page.url());
if (this.cdps.has(id)) {
// shouldn't happen, getting duplicate cdp
if (this.cdps.get(id) === cdp) {
logger.warn("worker already registered", {workerid: id}, "screencast");
return;
}
//const context = page.context();
this.cdps.set(id, cdp);
//this.urls.set(id, target.url());
const msg = "screencast";
@@ -198,10 +194,11 @@
logger.debug("screencastFrame", {workerid: id, url}, "screencast");
this.caches.set(id, data);
this.urls.set(id, url);
// keep previous data cached if just showing about:blank
if (url && !url.startsWith("about:blank")) {
this.caches.set(id, data);
this.urls.set(id, url);
await this.transport.sendAll({msg, id, data, url});
}
@@ -223,7 +220,7 @@
}
}
async stopById(id) {
async stopById(id, sendClose=false) {
this.caches.delete(id);
this.urls.delete(id);
@@ -237,7 +234,9 @@
}
}
//await this.transport.sendAll({msg: "close", id});
if (sendClose) {
await this.transport.sendAll({msg: "close", id});
}
this.cdps.delete(id);
}
@@ -251,7 +250,7 @@
logger.info("Started Screencast", {workerid: id}, "screencast");
await cdp.send("Page.startScreencast", {format: "png", everyNthFrame: 2, maxWidth: this.maxWidth, maxHeight: this.maxHeight});
await cdp.send("Page.startScreencast", {format: "png", everyNthFrame: 1, maxWidth: this.maxWidth, maxHeight: this.maxHeight});
}
async stopCast(cdp, id) {

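Caster slots are now keyed by the fixed numeric worker id, so a slot is reused when the same worker moves on to a new page, and is only released when stopById() is called with sendClose=true (from workerIdle()). A hypothetical viewer-side handler for the two message shapes sent above, {msg: "screencast", id, data, url} and {msg: "close", id}:

    // hypothetical client sketch; updateSlot()/releaseSlot() are stand-ins,
    // and the WebSocket URL is illustrative
    const ws = new WebSocket("ws://localhost:9037/ws");
    ws.onmessage = (event) => {
      const { msg, id, data, url } = JSON.parse(event.data);
      if (msg === "screencast") {
        updateSlot(id, data, url);  // render latest PNG frame for this worker
      } else if (msg === "close") {
        releaseSlot(id);            // worker went idle; slot is free for reuse
      }
    };
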
util/state.js

@@ -3,6 +3,39 @@ import { logger } from "./logger.js";
import { MAX_DEPTH } from "./constants.js";
// ============================================================================
export const LoadState = {
FAILED: 0,
CONTENT_LOADED: 1,
FULL_PAGE_LOADED: 2,
EXTRACTION_DONE: 3,
BEHAVIORS_DONE: 4,
};
// ============================================================================
export class PageState
{
constructor(redisData) {
this.url = redisData.url;
this.seedId = redisData.seedId;
this.depth = redisData.depth;
this.extraHops = redisData.extraHops;
this.workerid = null;
this.title = null;
this.isHTMLPage = null;
this.text = null;
this.skipBehaviors = false;
this.filteredFrames = [];
this.loadState = LoadState.FAILED;
this.logDetails = {};
}
}
// ============================================================================
export class RedisCrawlState
{
@@ -186,7 +219,7 @@ return 0;
await this.markStarted(data.url);
return data;
return new PageState(data);
}
async has(url) {

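A PageState travels with the URL through the whole crawl: the dequeue method above now returns new PageState(data) instead of the raw Redis data, crawlPage() mutates it (title, text, loadState), and pageFinished() reads loadState to decide between markFinished() and markFailed(). A compressed view of that flow (paraphrased from the crawler.js and worker diffs in this commit):

    // compressed flow; dequeue() stands in for the crawl state's queue-pop method
    const data = await crawlState.dequeue();         // -> new PageState(...)
    try {
      await crawler.crawlPage({page, cdp, data});    // mutates data.loadState
    } finally {
      await crawler.pageFinished(data);              // marks finished or failed
    }
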
util/timing.js

@@ -1,4 +1,4 @@
import { logger } from "./logger.js";
import { logger, errJSON } from "./logger.js";
export function sleep(seconds) {
return new Promise(resolve => setTimeout(resolve, seconds * 1000));
@@ -19,7 +19,7 @@ export function timedRun(promise, seconds, message="Promise timed out", logDetai
if (err == "timeout reached") {
logger.error(message, {"seconds": seconds, ...logDetails}, context);
} else {
logger.error("Promise rejected", {...err, ...logDetails}, context);
logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
}
});
}
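
errJSON() is imported from logger.js but not shown in this diff; the call site only needs something spreadable into the log details, so a minimal stand-in could look like:

    // hypothetical sketch; the real errJSON lives in logger.js
    function errJSON(e) {
      return { message: e.message, stack: e.stack };
    }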

View file

@@ -12,7 +12,8 @@ export function runWorkers(crawler, numWorkers, timeout) {
const workers = [];
for (let i = 0; i < numWorkers; i++) {
workers.push(new PageWorker(`worker-${i+1}`, crawler, timeout));
//workers.push(new PageWorker(`worker-${i+1}`, crawler, timeout));
workers.push(new PageWorker(i, crawler, timeout));
}
return Promise.allSettled(workers.map((worker) => worker.run()));
@@ -33,35 +34,43 @@ export class PageWorker
this.opts = null;
this.failed = false;
this.logDetails = {workerid: this.id};
this.crashed = false;
this.markCrashed = null;
this.crashBreak = null;
}
async closePage() {
if (this.page) {
await this.crawler.teardownPage(this.opts);
try {
await this.cdp.detach();
} catch (e) {
// ignore
if (!this.crashed) {
await this.crawler.teardownPage(this.opts);
} else {
logger.debug("Closing crashed page", {workerid: this.id}, "worker");
}
this.cdp = null;
try {
await this.page.close();
} catch (e) {
// ignore
}
if (this.crashed) {
const numPagesRemaining = this.crawler.browser.numPages() - 1;
logger.debug("Skipping teardown of crashed page", {numPagesRemaining, workerid: this.id}, "worker");
}
this.cdp = null;
this.page = null;
}
}
async initPage() {
if (this.page && ++this.reuseCount <= MAX_REUSE) {
if (!this.crashed && this.page && ++this.reuseCount <= MAX_REUSE) {
logger.debug("Reusing page", {reuseCount: this.reuseCount}, "worker");
return this.opts;
} else {
} else if (this.page) {
await this.closePage();
}
@@ -74,7 +83,7 @@ export class PageWorker
const { page, cdp } = await timedRun(
this.crawler.browser.newWindowPageWithCDP(),
NEW_WINDOW_TIMEOUT,
"New Window Timed Out!",
"New Window Timed Out",
{workerid},
"worker"
);
@@ -84,13 +93,16 @@ export class PageWorker
this.opts = {page: this.page, cdp: this.cdp, workerid};
// updated per page crawl
this.failed = false;
this.crashed = false;
this.crashBreak = new Promise((resolve, reject) => this.markCrashed = reject);
this.logDetails = {page: this.page.url(), workerid};
// more serious page crash, mark as failed
this.page.on("crash", () => {
logger.error("Page Crash", ...this.logDetails, "worker");
this.failed = true;
this.page.on("crash", (details) => {
logger.error("Page Crash", {details, ...this.logDetails}, "worker");
this.crashed = true;
this.markCrashed("crashed");
});
await this.crawler.setupPage(this.opts);
@@ -111,36 +123,30 @@ export class PageWorker
async timedCrawlPage(opts) {
const workerid = this.id;
const url = opts.data.url;
const { data } = opts;
const { url } = data;
logger.info("Starting page", {workerid, "page": url}, "worker");
this.logDetails = {page: url, workerid};
this.failed = false;
try {
const result = await timedRun(
this.crawler.crawlPage(opts),
this.timeout,
"Page Worker Timeout",
{workerid},
"worker"
);
this.failed = this.failed || !result;
await Promise.race([
timedRun(
this.crawler.crawlPage(opts),
this.timeout,
"Page Worker Timeout",
{workerid},
"worker"
),
this.crashBreak
]);
} catch (e) {
logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
this.failed = true;
} finally {
if (this.failed) {
logger.warn("Page Load Failed", this.logDetails, "worker");
this.crawler.markPageFailed(url);
}
await this.crawler.pageFinished(data);
}
return this.failed;
}
async run() {
@@ -166,17 +172,14 @@ export class PageWorker
const opts = await this.initPage();
// run timed crawl of page
const failed = await this.timedCrawlPage({...opts, data});
await this.timedCrawlPage({...opts, data});
// close page if failed
if (failed) {
logger.debug("Resetting failed page", {}, "worker");
await this.closePage();
}
} else {
// indicate that the worker has no more work (mostly for screencasting, status, etc...)
// depending on other works, will either get more work or crawl will end
this.crawler.workerIdle(this.id);
// otherwise, see if any pending urls
// check if any pending urls
const pending = await crawlState.numPending();
// if pending, sleep and check again

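The crashBreak used by timedCrawlPage() above is a promise that never resolves; the page's 'crash' handler rejects it via markCrashed(), so Promise.race() aborts the wait the moment Chrome reports a crash. The pattern, distilled (crawlWithTimeout() is a hypothetical stand-in for the timedRun()-wrapped crawlPage() call):

    // distilled crash-interrupt pattern from PageWorker
    let markCrashed;
    const crashBreak = new Promise((resolve, reject) => (markCrashed = reject));

    page.on("crash", () => markCrashed("crashed"));  // rejects the race below

    try {
      await Promise.race([crawlWithTimeout(page), crashBreak]);
    } catch (e) {
      // either the crawl threw, or the page crashed mid-crawl;
      // pageFinished() still runs in a finally block either way
    }
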
yarn.lock

@@ -1079,10 +1079,10 @@ browserslist@^4.21.3:
node-releases "^2.0.6"
update-browserslist-db "^1.0.9"
browsertrix-behaviors@^0.4.2:
version "0.4.2"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.4.2.tgz#830f4e37ddebf10dd923237dfd75c734d5d211e9"
integrity sha512-5w6kPL3NB/BkmEGxWt3NT3iddAaSzMR1TtDPS7b66fM9kkhpCjCv/R/zR951jWDIeV3flJFBOy09uI5o8asPqg==
browsertrix-behaviors@^0.5.0-beta.0:
version "0.5.0-beta.0"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.0-beta.0.tgz#d1a7c35cda31d740a374df1e833f36bd1890768d"
integrity sha512-RQMQlbV4OBAzYyhTI7imoem8p4MTj2XSDzlIZvA5sC5U89OMnJ0VM5KBAJzET3PUJkQlUQEOTiXtnsnodHXTUQ==
bser@2.1.1:
version "2.1.1"