QA Crawl Support (Beta) (#469)

Initial (beta) support for QA/replay crawling!
- Supports running a crawl over a given WACZ or a list of WACZs (multi-WACZ) as input, loaded in ReplayWeb.page (see the multi-WACZ sketch after this list)
- Runs a local HTTP server with a full-page, UI-less ReplayWeb.page embed
- The ReplayWeb.page release version is configured in the Dockerfile; the pinned ui.js and sw.js are fetched directly from cdnjs

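For reference, a minimal sketch of what a multi-WACZ JSON input might look like (the `resources`/`path` field names assume the ReplayWeb.page multi-WACZ convention; the URLs are placeholders):

```json
{
  "resources": [
    { "name": "crawl-1.wacz", "path": "https://example.com/crawl-1.wacz" },
    { "name": "crawl-2.wacz", "path": "https://example.com/crawl-2.wacz" }
  ]
}
```
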
Can be run with the `webrecorder/browsertrix-crawler qa` entrypoint (see the example invocation after this list).
- Requires `--qaSource`, pointing to the WACZ or multi-WACZ JSON to be replayed and QA'd
- Also supports `--qaRedisKey`: if specified, QA comparison data will be pushed to this Redis key
- Supports `--qaDebugImageDiff` for outputting crawl, replay, and diff images
- If using `--writePagesToRedis`, a `comparison` key is added to the existing page data, typed as follows:
  ```ts
  comparison: {
    screenshotMatch?: number;
    textMatch?: number;
    resourceCounts: {
      crawlGood?: number;
      crawlBad?: number;
      replayGood?: number;
      replayBad?: number;
    };
  };
  ```
- bump version to 1.1.0-beta.2
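
A hypothetical end-to-end invocation (image tag, volume mount, WACZ URL, and Redis key are placeholders, and any Redis connection options are omitted):

```sh
docker run -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler qa \
  --qaSource https://example.com/crawl.wacz \
  --qaRedisKey qa:pages \
  --qaDebugImageDiff \
  --writePagesToRedis
```

With `--qaRedisKey` and `--writePagesToRedis` set, each replayed page's entry, including its `comparison` data, is pushed to Redis as it is produced.
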
Commit bb9c82493b (parent 22a7351dc7), authored by Ilya Kreymer on 2024-03-22 17:32:42 -07:00, committed via GitHub.
22 changed files with 2068 additions and 598 deletions


```diff
@@ -13,6 +13,8 @@ import {
PageCallbacks,
} from "./util/state.js";
import { parseArgs } from "./util/argParser.js";
import yaml from "js-yaml";
import * as warcio from "warcio";
@@ -29,7 +31,6 @@ import {
} from "./util/storage.js";
import { ScreenCaster, WSTransport } from "./util/screencaster.js";
import { Screenshots } from "./util/screenshots.js";
import { parseArgs } from "./util/argParser.js";
import { initRedis } from "./util/redis.js";
import { logger, formatErr } from "./util/logger.js";
import {
@@ -57,6 +58,7 @@ import { OriginOverride } from "./util/originoverride.js";
import { Agent as HTTPAgent } from "http";
import { Agent as HTTPSAgent } from "https";
import { CDPSession, Frame, HTTPRequest, Page } from "puppeteer-core";
import { Recorder } from "./util/recorder.js";
import { SitemapReader } from "./util/sitemapper.js";
import { ScopedSeed } from "./util/seeds.js";
@@ -146,6 +148,8 @@ export class Crawler {
pagesDir: string;
pagesFile: string;
archivesDir: string;
blockRules: BlockRules | null;
adBlockRules: AdBlockRules | null;
@@ -154,11 +158,15 @@
screencaster: ScreenCaster | null = null;
skipTextDocs = 0;
interrupted = false;
finalExit = false;
uploadAndDeleteLocal = false;
done = false;
textInPages = false;
customBehaviors = "";
behaviorsChecked = false;
behaviorLastLine?: string;
@@ -178,10 +186,12 @@
crawler: Crawler;
}) => NonNullable<unknown>;
recording = true;
constructor() {
const res = parseArgs();
this.params = res.parsed;
this.origConfig = res.origConfig;
const args = this.parseArgs();
this.params = args.parsed;
this.origConfig = args.origConfig;
// root collections dir
this.collDir = path.join(
@@ -259,6 +269,9 @@
// pages file
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
// archives dir
this.archivesDir = path.join(this.collDir, "archive");
this.blockRules = null;
this.adBlockRules = null;
@@ -268,6 +281,8 @@
this.finalExit = false;
this.uploadAndDeleteLocal = false;
this.textInPages = this.params.text.includes("to-pages");
this.done = false;
this.customBehaviors = "";
@@ -281,6 +296,10 @@
}
}
protected parseArgs() {
return parseArgs();
}
configureUA() {
// override userAgent
if (this.params.userAgent) {
@@ -434,7 +453,9 @@
// logger.info("wb-manager init failed, collection likely already exists");
//}
fs.mkdirSync(this.logDir, { recursive: true });
await fsp.mkdir(this.logDir, { recursive: true });
await fsp.mkdir(this.archivesDir, { recursive: true });
this.logFH = fs.createWriteStream(this.logFilename);
logger.setExternalLogStream(this.logFH);
@@ -721,10 +742,10 @@ self.__bx_behaviors.selectMainBehavior();
return "";
}
async crawlPage(opts: WorkerState) {
async crawlPage(opts: WorkerState): Promise<void> {
await this.writeStats();
const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
const { page, data, workerid, callbacks, directFetchCapture } = opts;
data.callbacks = callbacks;
const { url } = data;
@@ -764,7 +785,7 @@ self.__bx_behaviors.selectMainBehavior();
{ url, ...logDetails },
"fetch",
);
return true;
return;
}
} catch (e) {
// filtered out direct fetch
@@ -782,7 +803,14 @@ self.__bx_behaviors.selectMainBehavior();
data.title = await page.title();
data.favicon = await this.getFavicon(page, logDetails);
const archiveDir = path.join(this.collDir, "archive");
await this.doPostLoadActions(opts);
}
async doPostLoadActions(opts: WorkerState, saveOutput = false) {
const { page, cdp, data, workerid } = opts;
const { url } = data;
const logDetails = { page: url, workerid };
if (this.params.screenshot) {
if (!data.isHTMLPage) {
@@ -793,10 +821,10 @@ self.__bx_behaviors.selectMainBehavior();
browser: this.browser,
page,
url,
directory: archiveDir,
directory: this.archivesDir,
});
if (this.params.screenshot.includes("view")) {
await screenshots.take();
await screenshots.take("view", saveOutput ? data : null);
}
if (this.params.screenshot.includes("fullPage")) {
await screenshots.takeFullPage();
@@ -812,15 +840,16 @@ self.__bx_behaviors.selectMainBehavior();
textextract = new TextExtractViaSnapshot(cdp, {
warcPrefix: this.warcPrefix,
url,
directory: archiveDir,
directory: this.archivesDir,
skipDocs: this.skipTextDocs,
});
const { changed, text } = await textextract.extractAndStoreText(
const { text } = await textextract.extractAndStoreText(
"text",
false,
this.params.text.includes("to-warc"),
);
if (changed && text && this.params.text.includes("to-pages")) {
if (text && (this.textInPages || saveOutput)) {
data.text = text;
}
}
@@ -868,8 +897,6 @@ self.__bx_behaviors.selectMainBehavior();
);
await sleep(this.params.pageExtraDelay);
}
return true;
}
async pageFinished(data: PageState) {
@@ -1047,8 +1074,7 @@ self.__bx_behaviors.selectMainBehavior();
async checkLimits() {
let interrupt = false;
const dir = path.join(this.collDir, "archive");
const size = await getDirSize(dir);
const size = await getDirSize(this.archivesDir);
await this.crawlState.setArchiveSize(size);
@@ -1230,28 +1256,11 @@ self.__bx_behaviors.selectMainBehavior();
this.screencaster = this.initScreenCaster();
if (this.params.originOverride.length) {
if (this.params.originOverride && this.params.originOverride.length) {
this.originOverride = new OriginOverride(this.params.originOverride);
}
for (let i = 0; i < this.params.scopedSeeds.length; i++) {
const seed = this.params.scopedSeeds[i];
if (!(await this.queueUrl(i, seed.url, 0, 0))) {
if (this.limitHit) {
break;
}
}
if (seed.sitemap) {
await timedRun(
this.parseSitemap(seed, i),
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
"Sitemap initial fetch timed out",
{ sitemap: seed.sitemap, seed: seed.url },
"sitemap",
);
}
}
await this._addInitialSeeds();
await this.browser.launch({
profileUrl: this.params.profile,
@@ -1272,12 +1281,14 @@ self.__bx_behaviors.selectMainBehavior();
"browser",
);
},
recording: this.recording,
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} as any);
// --------------
// Run Crawl Here!
await runWorkers(this, this.params.workers, this.maxPageTime, this.collDir);
await runWorkers(this, this.params.workers, this.maxPageTime);
// --------------
await this.serializeConfig(true);
@@ -1297,6 +1308,27 @@ self.__bx_behaviors.selectMainBehavior();
await this.postCrawl();
}
protected async _addInitialSeeds() {
for (let i = 0; i < this.params.scopedSeeds.length; i++) {
const seed = this.params.scopedSeeds[i];
if (!(await this.queueUrl(i, seed.url, 0, 0))) {
if (this.limitHit) {
break;
}
}
if (seed.sitemap) {
await timedRun(
this.parseSitemap(seed, i),
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
"Sitemap initial fetch timed out",
{ sitemap: seed.sitemap, seed: seed.url },
"sitemap",
);
}
}
}
async postCrawl() {
if (this.params.combineWARC) {
await this.combineWARC();
@@ -1307,9 +1339,9 @@ self.__bx_behaviors.selectMainBehavior();
await fsp.mkdir(path.join(this.collDir, "indexes"), { recursive: true });
await this.crawlState.setStatus("generate-cdx");
const warcList = await fsp.readdir(path.join(this.collDir, "archive"));
const warcList = await fsp.readdir(this.archivesDir);
const warcListFull = warcList.map((filename) =>
path.join(this.collDir, "archive", filename),
path.join(this.archivesDir, filename),
);
//const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
@@ -1377,10 +1409,8 @@ self.__bx_behaviors.selectMainBehavior();
logger.info("Generating WACZ");
await this.crawlState.setStatus("generate-wacz");
const archiveDir = path.join(this.collDir, "archive");
// Get a list of the warcs inside
const warcFileList = await fsp.readdir(archiveDir);
const warcFileList = await fsp.readdir(this.archivesDir);
// is finished (>0 pages and all pages written)
const isFinished = await this.crawlState.isFinished();
@@ -1440,7 +1470,9 @@ self.__bx_behaviors.selectMainBehavior();
createArgs.push("-f");
warcFileList.forEach((val) => createArgs.push(path.join(archiveDir, val)));
warcFileList.forEach((val) =>
createArgs.push(path.join(this.archivesDir, val)),
);
// create WACZ
const waczResult = await this.awaitProcess(
@@ -1900,13 +1932,15 @@ self.__bx_behaviors.selectMainBehavior();
depth: number,
extraHops: number,
logDetails: LogDetails = {},
ts = 0,
pageid?: string,
) {
if (this.limitHit) {
return false;
}
const result = await this.crawlState.addToQueue(
{ url, seedId, depth, extraHops },
{ url, seedId, depth, extraHops, ts, pageid },
this.pageLimit,
);
@@ -1954,7 +1988,7 @@ self.__bx_behaviors.selectMainBehavior();
id: "pages",
title: "All Pages",
};
header["hasText"] = this.params.text.includes("to-pages");
header["hasText"] = String(this.textInPages);
if (this.params.text.length) {
logger.debug("Text Extraction: " + this.params.text.join(","));
} else {
@@ -1968,20 +2002,30 @@ self.__bx_behaviors.selectMainBehavior();
}
}
async writePage({
pageid,
url,
depth,
title,
text,
loadState,
mime,
favicon,
ts,
status,
}: PageState) {
const row: PageEntry = { id: pageid!, url, title, loadState };
protected pageEntryForRedis(
entry: Record<string, string | number | boolean | object>,
// eslint-disable-next-line @typescript-eslint/no-unused-vars
state: PageState,
) {
return entry;
}
async writePage(state: PageState) {
const {
pageid,
url,
depth,
title,
text,
loadState,
mime,
favicon,
status,
} = state;
const row: PageEntry = { id: pageid, url, title, loadState };
let { ts } = state;
if (!ts) {
ts = new Date();
logger.warn("Page date missing, setting to now", { url, ts });
@@ -1998,14 +2042,16 @@ self.__bx_behaviors.selectMainBehavior();
}
if (this.params.writePagesToRedis) {
await this.crawlState.writeToPagesQueue(JSON.stringify(row));
await this.crawlState.writeToPagesQueue(
JSON.stringify(this.pageEntryForRedis(row, state)),
);
}
if (depth === 0) {
row.seed = true;
}
if (text) {
if (text && this.textInPages) {
row.text = text;
}
@@ -2151,7 +2197,7 @@ self.__bx_behaviors.selectMainBehavior();
await this.crawlState.setStatus("generate-warc");
// Get the list of created Warcs
const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));
const warcLists = await fsp.readdir(this.archivesDir);
logger.debug(`Combining ${warcLists.length} WARCs...`);
@@ -2159,7 +2205,7 @@ self.__bx_behaviors.selectMainBehavior();
// Go through a list of the created works and create an array sorted by their filesize with the largest file first.
for (let i = 0; i < warcLists.length; i++) {
const fileName = path.join(this.collDir, "archive", warcLists[i]);
const fileName = path.join(this.archivesDir, warcLists[i]);
const fileSize = await getFileSize(fileName);
fileSizeObjects.push({ fileSize: fileSize, fileName: fileName });
fileSizeObjects.sort((a, b) => b.fileSize - a.fileSize);
@@ -2316,6 +2362,21 @@ self.__bx_behaviors.selectMainBehavior();
await this.storage.uploadFile(filename, targetFilename);
}
}
createRecorder(id: number): Recorder | null {
if (!this.recording) {
return null;
}
const res = new Recorder({
workerid: id,
collDir: this.collDir,
crawler: this,
});
this.browser.recorders.push(res);
return res;
}
}
function shouldIgnoreAbort(req: HTTPRequest) {
```
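
The diff above threads several protected hooks through `Crawler` — `parseArgs()`, `_addInitialSeeds()`, `doPostLoadActions()`, `pageEntryForRedis()` — plus the `recording` flag and the `createRecorder()` factory. A minimal sketch of how a QA/replay subclass might plug into them (the class name and everything inside the method bodies are illustrative assumptions, not the actual replay crawler from this PR):

```ts
import { Crawler } from "./crawler.js";
import { PageState } from "./util/state.js";

// Illustrative only: shows where the hooks added in this commit plug in.
class ReplayQACrawler extends Crawler {
  constructor() {
    super();
    // Replay-only crawl: with recording off, createRecorder() returns null
    this.recording = false;
  }

  protected parseArgs() {
    // A subclass could validate QA-specific options (e.g. --qaSource) here
    return super.parseArgs();
  }

  protected async _addInitialSeeds(): Promise<void> {
    // Instead of queueing scoped seeds, queue pages from the source WACZ,
    // using the new ts/pageid parameters added to queueUrl()
  }

  protected pageEntryForRedis(
    entry: Record<string, string | number | boolean | object>,
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    state: PageState,
  ) {
    // Attach the comparison data described in the notes above before the
    // entry is pushed via --writePagesToRedis, e.g.:
    // entry.comparison = ...; (field name per the PR description)
    return entry;
  }
}
```

Since `createRecorder()` returns `null` whenever `recording` is false, a replay crawl produces no new WARC data while still exercising the full page-processing pipeline.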