Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-12-08 06:09:48 +00:00)
QA Crawl Support (Beta) (#469)
Initial (beta) support for QA/replay crawling!
- Supports running a crawl over a given WACZ or list of WACZs (multi-WACZ) input, replayed in ReplayWeb.page
- Runs a local HTTP server with a full-page, UI-less ReplayWeb.page embed
- The ReplayWeb.page release version is configured in the Dockerfile; the pinned ui.js and sw.js are fetched directly from cdnjs
Can be deployed with the `webrecorder/browsertrix-crawler qa` entrypoint (see the example after this list).
- Requires `--qaSource`, pointing to the WACZ or multi-WACZ JSON that will be replayed and QA'd
- Also supports `--qaRedisKey`: if specified, QA comparison data will be pushed to that Redis key.
- Supports `--qaDebugImageDiff` for outputting crawl, replay, and diff images.
- If using `--writePagesToRedis`, a `comparison` key is added to the existing page data:
```
comparison: {
  screenshotMatch?: number;
  textMatch?: number;
  resourceCounts: {
    crawlGood?: number;
    crawlBad?: number;
    replayGood?: number;
    replayBad?: number;
  };
};
```
- Bump version to 1.1.0-beta.2
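For reference, a hypothetical invocation of the new entrypoint (the volume mount and WACZ path below are placeholders, not taken from this change; the flags are those listed above). Adding `--qaRedisKey` and `--writePagesToRedis` additionally pushes the per-page `comparison` data to Redis.

```
# Placeholder paths; assumes a WACZ from a previous crawl is mounted into /crawls/
docker run -v $PWD/crawls/:/crawls/ webrecorder/browsertrix-crawler qa \
  --qaSource /crawls/collections/mycrawl/mycrawl.wacz \
  --qaDebugImageDiff
```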
Parent: 22a7351dc7
Commit: bb9c82493b
22 changed files with 2068 additions and 598 deletions
src/crawler.ts (187 changes; excerpt)

@@ -13,6 +13,8 @@ import {
   PageCallbacks,
 } from "./util/state.js";
+import { parseArgs } from "./util/argParser.js";
 import yaml from "js-yaml";
 import * as warcio from "warcio";
@@ -29,7 +31,6 @@ import {
 } from "./util/storage.js";
 import { ScreenCaster, WSTransport } from "./util/screencaster.js";
 import { Screenshots } from "./util/screenshots.js";
-import { parseArgs } from "./util/argParser.js";
 import { initRedis } from "./util/redis.js";
 import { logger, formatErr } from "./util/logger.js";
 import {
@@ -57,6 +58,7 @@ import { OriginOverride } from "./util/originoverride.js";
 import { Agent as HTTPAgent } from "http";
 import { Agent as HTTPSAgent } from "https";
 import { CDPSession, Frame, HTTPRequest, Page } from "puppeteer-core";
 import { Recorder } from "./util/recorder.js";
 import { SitemapReader } from "./util/sitemapper.js";
 import { ScopedSeed } from "./util/seeds.js";
@@ -146,6 +148,8 @@ export class Crawler {
   pagesDir: string;
   pagesFile: string;
 
+  archivesDir: string;
+
   blockRules: BlockRules | null;
   adBlockRules: AdBlockRules | null;
@@ -154,11 +158,15 @@ export class Crawler {
 
   screencaster: ScreenCaster | null = null;
 
+  skipTextDocs = 0;
+
   interrupted = false;
   finalExit = false;
   uploadAndDeleteLocal = false;
   done = false;
 
+  textInPages = false;
+
   customBehaviors = "";
   behaviorsChecked = false;
   behaviorLastLine?: string;
@@ -178,10 +186,12 @@ export class Crawler {
     crawler: Crawler;
   }) => NonNullable<unknown>;
 
+  recording = true;
+
   constructor() {
-    const res = parseArgs();
-    this.params = res.parsed;
-    this.origConfig = res.origConfig;
+    const args = this.parseArgs();
+    this.params = args.parsed;
+    this.origConfig = args.origConfig;
 
     // root collections dir
     this.collDir = path.join(
@@ -259,6 +269,9 @@ export class Crawler {
     // pages file
     this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
 
+    // archives dir
+    this.archivesDir = path.join(this.collDir, "archive");
+
     this.blockRules = null;
     this.adBlockRules = null;
@@ -268,6 +281,8 @@ export class Crawler {
     this.finalExit = false;
     this.uploadAndDeleteLocal = false;
 
+    this.textInPages = this.params.text.includes("to-pages");
+
     this.done = false;
 
     this.customBehaviors = "";
@@ -281,6 +296,10 @@ export class Crawler {
     }
   }
 
+  protected parseArgs() {
+    return parseArgs();
+  }
+
   configureUA() {
     // override userAgent
     if (this.params.userAgent) {
@@ -434,7 +453,9 @@ export class Crawler {
     // logger.info("wb-manager init failed, collection likely already exists");
     //}
 
-    fs.mkdirSync(this.logDir, { recursive: true });
+    await fsp.mkdir(this.logDir, { recursive: true });
+    await fsp.mkdir(this.archivesDir, { recursive: true });
 
     this.logFH = fs.createWriteStream(this.logFilename);
     logger.setExternalLogStream(this.logFH);
@@ -721,10 +742,10 @@ self.__bx_behaviors.selectMainBehavior();
     return "";
   }
 
-  async crawlPage(opts: WorkerState) {
+  async crawlPage(opts: WorkerState): Promise<void> {
     await this.writeStats();
 
-    const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
+    const { page, data, workerid, callbacks, directFetchCapture } = opts;
     data.callbacks = callbacks;
 
     const { url } = data;
@@ -764,7 +785,7 @@ self.__bx_behaviors.selectMainBehavior();
           { url, ...logDetails },
           "fetch",
         );
-        return true;
+        return;
       }
     } catch (e) {
       // filtered out direct fetch
@@ -782,7 +803,14 @@ self.__bx_behaviors.selectMainBehavior();
     data.title = await page.title();
     data.favicon = await this.getFavicon(page, logDetails);
 
-    const archiveDir = path.join(this.collDir, "archive");
+    await this.doPostLoadActions(opts);
+  }
+
+  async doPostLoadActions(opts: WorkerState, saveOutput = false) {
+    const { page, cdp, data, workerid } = opts;
+    const { url } = data;
+
+    const logDetails = { page: url, workerid };
 
     if (this.params.screenshot) {
       if (!data.isHTMLPage) {
@@ -793,10 +821,10 @@ self.__bx_behaviors.selectMainBehavior();
         browser: this.browser,
         page,
         url,
-        directory: archiveDir,
+        directory: this.archivesDir,
       });
       if (this.params.screenshot.includes("view")) {
-        await screenshots.take();
+        await screenshots.take("view", saveOutput ? data : null);
       }
       if (this.params.screenshot.includes("fullPage")) {
        await screenshots.takeFullPage();
@@ -812,15 +840,16 @@ self.__bx_behaviors.selectMainBehavior();
       textextract = new TextExtractViaSnapshot(cdp, {
         warcPrefix: this.warcPrefix,
         url,
-        directory: archiveDir,
+        directory: this.archivesDir,
+        skipDocs: this.skipTextDocs,
       });
-      const { changed, text } = await textextract.extractAndStoreText(
+      const { text } = await textextract.extractAndStoreText(
         "text",
         false,
         this.params.text.includes("to-warc"),
       );
 
-      if (changed && text && this.params.text.includes("to-pages")) {
+      if (text && (this.textInPages || saveOutput)) {
         data.text = text;
       }
     }
@@ -868,8 +897,6 @@ self.__bx_behaviors.selectMainBehavior();
       );
       await sleep(this.params.pageExtraDelay);
     }
-
-    return true;
   }
 
   async pageFinished(data: PageState) {
@@ -1047,8 +1074,7 @@ self.__bx_behaviors.selectMainBehavior();
   async checkLimits() {
     let interrupt = false;
 
-    const dir = path.join(this.collDir, "archive");
-    const size = await getDirSize(dir);
+    const size = await getDirSize(this.archivesDir);
 
     await this.crawlState.setArchiveSize(size);
@@ -1230,28 +1256,11 @@ self.__bx_behaviors.selectMainBehavior();
 
     this.screencaster = this.initScreenCaster();
 
-    if (this.params.originOverride.length) {
+    if (this.params.originOverride && this.params.originOverride.length) {
       this.originOverride = new OriginOverride(this.params.originOverride);
     }
 
-    for (let i = 0; i < this.params.scopedSeeds.length; i++) {
-      const seed = this.params.scopedSeeds[i];
-      if (!(await this.queueUrl(i, seed.url, 0, 0))) {
-        if (this.limitHit) {
-          break;
-        }
-      }
-
-      if (seed.sitemap) {
-        await timedRun(
-          this.parseSitemap(seed, i),
-          SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
-          "Sitemap initial fetch timed out",
-          { sitemap: seed.sitemap, seed: seed.url },
-          "sitemap",
-        );
-      }
-    }
+    await this._addInitialSeeds();
 
     await this.browser.launch({
       profileUrl: this.params.profile,
@@ -1272,12 +1281,14 @@ self.__bx_behaviors.selectMainBehavior();
           "browser",
         );
       },
 
+      recording: this.recording,
       // eslint-disable-next-line @typescript-eslint/no-explicit-any
     } as any);
 
     // --------------
     // Run Crawl Here!
-    await runWorkers(this, this.params.workers, this.maxPageTime, this.collDir);
+    await runWorkers(this, this.params.workers, this.maxPageTime);
     // --------------
 
     await this.serializeConfig(true);
@@ -1297,6 +1308,27 @@ self.__bx_behaviors.selectMainBehavior();
     await this.postCrawl();
   }
 
+  protected async _addInitialSeeds() {
+    for (let i = 0; i < this.params.scopedSeeds.length; i++) {
+      const seed = this.params.scopedSeeds[i];
+      if (!(await this.queueUrl(i, seed.url, 0, 0))) {
+        if (this.limitHit) {
+          break;
+        }
+      }
+
+      if (seed.sitemap) {
+        await timedRun(
+          this.parseSitemap(seed, i),
+          SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
+          "Sitemap initial fetch timed out",
+          { sitemap: seed.sitemap, seed: seed.url },
+          "sitemap",
+        );
+      }
+    }
+  }
+
   async postCrawl() {
     if (this.params.combineWARC) {
       await this.combineWARC();
@@ -1307,9 +1339,9 @@ self.__bx_behaviors.selectMainBehavior();
       await fsp.mkdir(path.join(this.collDir, "indexes"), { recursive: true });
       await this.crawlState.setStatus("generate-cdx");
 
-      const warcList = await fsp.readdir(path.join(this.collDir, "archive"));
+      const warcList = await fsp.readdir(this.archivesDir);
       const warcListFull = warcList.map((filename) =>
-        path.join(this.collDir, "archive", filename),
+        path.join(this.archivesDir, filename),
       );
 
       //const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
@@ -1377,10 +1409,8 @@ self.__bx_behaviors.selectMainBehavior();
     logger.info("Generating WACZ");
     await this.crawlState.setStatus("generate-wacz");
 
-    const archiveDir = path.join(this.collDir, "archive");
-
     // Get a list of the warcs inside
-    const warcFileList = await fsp.readdir(archiveDir);
+    const warcFileList = await fsp.readdir(this.archivesDir);
 
     // is finished (>0 pages and all pages written)
     const isFinished = await this.crawlState.isFinished();
@@ -1440,7 +1470,9 @@ self.__bx_behaviors.selectMainBehavior();
 
     createArgs.push("-f");
 
-    warcFileList.forEach((val) => createArgs.push(path.join(archiveDir, val)));
+    warcFileList.forEach((val) =>
+      createArgs.push(path.join(this.archivesDir, val)),
+    );
 
     // create WACZ
     const waczResult = await this.awaitProcess(
@@ -1900,13 +1932,15 @@ self.__bx_behaviors.selectMainBehavior();
     depth: number,
     extraHops: number,
     logDetails: LogDetails = {},
+    ts = 0,
+    pageid?: string,
   ) {
     if (this.limitHit) {
       return false;
     }
 
     const result = await this.crawlState.addToQueue(
-      { url, seedId, depth, extraHops },
+      { url, seedId, depth, extraHops, ts, pageid },
       this.pageLimit,
     );
@@ -1954,7 +1988,7 @@ self.__bx_behaviors.selectMainBehavior();
       id: "pages",
       title: "All Pages",
     };
-    header["hasText"] = this.params.text.includes("to-pages");
+    header["hasText"] = String(this.textInPages);
     if (this.params.text.length) {
       logger.debug("Text Extraction: " + this.params.text.join(","));
     } else {
@@ -1968,20 +2002,30 @@ self.__bx_behaviors.selectMainBehavior();
     }
   }
 
-  async writePage({
-    pageid,
-    url,
-    depth,
-    title,
-    text,
-    loadState,
-    mime,
-    favicon,
-    ts,
-    status,
-  }: PageState) {
-    const row: PageEntry = { id: pageid!, url, title, loadState };
+  protected pageEntryForRedis(
+    entry: Record<string, string | number | boolean | object>,
+    // eslint-disable-next-line @typescript-eslint/no-unused-vars
+    state: PageState,
+  ) {
+    return entry;
+  }
+
+  async writePage(state: PageState) {
+    const {
+      pageid,
+      url,
+      depth,
+      title,
+      text,
+      loadState,
+      mime,
+      favicon,
+      status,
+    } = state;
+
+    const row: PageEntry = { id: pageid, url, title, loadState };
+
+    let { ts } = state;
     if (!ts) {
       ts = new Date();
       logger.warn("Page date missing, setting to now", { url, ts });
@@ -1998,14 +2042,16 @@ self.__bx_behaviors.selectMainBehavior();
     }
 
     if (this.params.writePagesToRedis) {
-      await this.crawlState.writeToPagesQueue(JSON.stringify(row));
+      await this.crawlState.writeToPagesQueue(
+        JSON.stringify(this.pageEntryForRedis(row, state)),
+      );
     }
 
     if (depth === 0) {
       row.seed = true;
     }
 
-    if (text) {
+    if (text && this.textInPages) {
       row.text = text;
     }
 
@@ -2151,7 +2197,7 @@ self.__bx_behaviors.selectMainBehavior();
     await this.crawlState.setStatus("generate-warc");
 
     // Get the list of created Warcs
-    const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));
+    const warcLists = await fsp.readdir(this.archivesDir);
 
     logger.debug(`Combining ${warcLists.length} WARCs...`);
 
@@ -2159,7 +2205,7 @@ self.__bx_behaviors.selectMainBehavior();
 
     // Go through a list of the created works and create an array sorted by their filesize with the largest file first.
     for (let i = 0; i < warcLists.length; i++) {
-      const fileName = path.join(this.collDir, "archive", warcLists[i]);
+      const fileName = path.join(this.archivesDir, warcLists[i]);
       const fileSize = await getFileSize(fileName);
       fileSizeObjects.push({ fileSize: fileSize, fileName: fileName });
       fileSizeObjects.sort((a, b) => b.fileSize - a.fileSize);
@@ -2316,6 +2362,21 @@ self.__bx_behaviors.selectMainBehavior();
       await this.storage.uploadFile(filename, targetFilename);
     }
   }
+
+  createRecorder(id: number): Recorder | null {
+    if (!this.recording) {
+      return null;
+    }
+
+    const res = new Recorder({
+      workerid: id,
+      collDir: this.collDir,
+      crawler: this,
+    });
+
+    this.browser.recorders.push(res);
+    return res;
+  }
 }
 
 function shouldIgnoreAbort(req: HTTPRequest) {
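The `pageEntryForRedis()` hook added above is a no-op in the base `Crawler`; presumably the QA entrypoint overrides it to attach the `comparison` block described in the commit message. A minimal sketch of such an override, assuming a hypothetical `QACrawler` subclass, an illustrative per-page results map, and the usual import paths (none of which appear in this excerpt):

```
import { Crawler } from "./crawler.js";
import { PageState } from "./util/state.js";

// Hypothetical subclass; only pageEntryForRedis() and its signature come from the diff above.
class QACrawler extends Crawler {
  // Illustrative storage for per-page QA comparison results, keyed by page URL.
  comparisons = new Map<string, object>();

  protected override pageEntryForRedis(
    entry: Record<string, string | number | boolean | object>,
    state: PageState,
  ) {
    // Attach screenshot/text match scores and resource counts, if computed for this page,
    // so they ride along with the entry written via --writePagesToRedis.
    const comparison = this.comparisons.get(state.url);
    if (comparison) {
      entry.comparison = comparison;
    }
    return entry;
  }
}
```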