QA Crawl Support (Beta) (#469)

Initial (beta) support for QA/replay crawling!
- Supports running a crawl over a given WACZ / list of WACZs (multi-WACZ) input, replayed via ReplayWeb.page
- Runs a local HTTP server that serves a full-page, UI-less ReplayWeb.page embed
- The ReplayWeb.page release version is configured in the Dockerfile; the pinned ui.js and sw.js are fetched directly from jsDelivr

Can be deployed with the `webrecorder/browsertrix-crawler qa` entrypoint.
- Requires `--qaSource`, pointing to the WACZ or multi-WACZ JSON that will be replayed and QA'd
- Also supports `--qaRedisKey`: if specified, QA comparison data will be pushed to that Redis key
- Supports `--qaDebugImageDiff` for outputting crawl / replay / diff images
- If using `--writePagesToRedis`, a `comparison` key is added to the existing page data (a consumer sketch follows this list):
```ts
  comparison: {
    screenshotMatch?: number;
    textMatch?: number;
    resourceCounts: {
      crawlGood?: number;
      crawlBad?: number;
      replayGood?: number;
      replayBad?: number;
    };
  };
  ```
- bump version to 1.1.0-beta.2
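
For downstream consumers, here is a minimal sketch of reading these entries. It assumes `--qaRedisKey` was set to a hypothetical `qa:pages` key and that entries arrive as JSON strings; the key name and the pop direction are illustrative, not confirmed by this commit:

```ts
import Redis from "ioredis";

// Drain QA page entries and print the comparison scores.
// "qa:pages" is a hypothetical key name passed via --qaRedisKey.
async function readComparisons(redisUrl: string, key = "qa:pages") {
  const redis = new Redis(redisUrl);
  for (let entry = await redis.lpop(key); entry; entry = await redis.lpop(key)) {
    const page = JSON.parse(entry);
    if (page.comparison) {
      const { screenshotMatch, textMatch, resourceCounts } = page.comparison;
      console.log(page.url, { screenshotMatch, textMatch, resourceCounts });
    }
  }
  redis.disconnect();
}
```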
Commit bb9c82493b (parent 22a7351dc7) by Ilya Kreymer, committed via GitHub on 2024-03-22 17:32:42 -07:00. 22 changed files with 2068 additions and 598 deletions.

**CI workflow:**

```diff
@@ -42,10 +42,10 @@ jobs:
        run: yarn run tsc
      - name: build docker
        run: docker-compose build
-     - name: run jest
+     - name: run all tests as root
        run: sudo yarn test
-     - name: run saved state test with volume owned by different user
+     - name: run saved state + qa compare test as non-root - with volume owned by current user
        run: |
          sudo rm -rf ./test-crawls
          mkdir test-crawls
-         sudo yarn test ./tests/saved-state.test.js
+         sudo yarn test ./tests/saved-state.test.js ./tests/qa_compare.test.js
```

**Dockerfile:**

```diff
@@ -48,9 +48,15 @@ ADD config/ /app/
 ADD html/ /app/html/

-RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js
+ARG RWP_VERSION=1.8.15
+ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/
+ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/

-RUN ln -s /app/dist/main.js /usr/bin/crawl; ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile
+RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js && chmod a+r /app/html/rwp/*
+
+RUN ln -s /app/dist/main.js /usr/bin/crawl; \
+    ln -s /app/dist/main.js /usr/bin/qa; \
+    ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile

 WORKDIR /crawls
```

**html/replay.html** (new file, 39 lines):

```html
<!doctype html>
<html>
  <head>
    <script src="/ui.js"></script>
    <style>
      html {
        width: 100%;
        height: 100%;
        display: flex;
      }
      body {
        width: 100%;
        margin: 0;
        padding: 0;
      }
      replay-web-page {
        margin: 0;
        padding: 0;
        border: 0;
        position: fixed;
        width: 100vw;
        height: 100vh;
        top: 0;
        left: 0;
      }
    </style>
  </head>
  <body>
    <replay-web-page
      embed="replayonly"
      deepLink="true"
      source="$SOURCE"
      url="about:blank"
      ts=""
      coll="replay"
    >
    </replay-web-page>
  </body>
</html>
```

**package.json:**

```diff
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "1.0.2",
+  "version": "1.1.0-beta.2",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
@@ -24,9 +24,12 @@
     "get-folder-size": "^4.0.0",
     "husky": "^8.0.3",
     "ioredis": "^5.3.2",
+    "js-levenshtein": "^1.1.6",
     "js-yaml": "^4.1.0",
     "minio": "^7.1.3",
     "p-queue": "^7.3.4",
+    "pixelmatch": "^5.3.0",
+    "pngjs": "^7.0.0",
     "puppeteer-core": "^20.8.2",
     "sax": "^1.3.0",
     "sharp": "^0.32.6",
@@ -37,8 +40,11 @@
     "yargs": "^17.7.2"
   },
   "devDependencies": {
+    "@types/js-levenshtein": "^1.1.3",
     "@types/js-yaml": "^4.0.8",
     "@types/node": "^20.8.7",
+    "@types/pixelmatch": "^5.2.6",
+    "@types/pngjs": "^6.0.4",
     "@types/uuid": "^9.0.6",
     "@types/ws": "^8.5.8",
     "@typescript-eslint/eslint-plugin": "^6.10.0",
@@ -46,7 +52,7 @@
     "eslint": "^8.53.0",
     "eslint-config-prettier": "^9.0.0",
     "eslint-plugin-react": "^7.22.0",
-    "jest": "^29.2.1",
+    "jest": "^29.7.0",
     "md5": "^2.3.0",
     "prettier": "3.0.3",
     "typescript": "^5.2.2"
```

**src/crawler.ts:**

```diff
@@ -13,6 +13,8 @@ import {
   PageCallbacks,
 } from "./util/state.js";
+import { parseArgs } from "./util/argParser.js";
+
 import yaml from "js-yaml";
 import * as warcio from "warcio";
@@ -29,7 +31,6 @@ import {
 } from "./util/storage.js";
 import { ScreenCaster, WSTransport } from "./util/screencaster.js";
 import { Screenshots } from "./util/screenshots.js";
-import { parseArgs } from "./util/argParser.js";
 import { initRedis } from "./util/redis.js";
 import { logger, formatErr } from "./util/logger.js";
 import {
@@ -57,6 +58,7 @@ import { OriginOverride } from "./util/originoverride.js";
 import { Agent as HTTPAgent } from "http";
 import { Agent as HTTPSAgent } from "https";
 import { CDPSession, Frame, HTTPRequest, Page } from "puppeteer-core";
+import { Recorder } from "./util/recorder.js";
 import { SitemapReader } from "./util/sitemapper.js";
 import { ScopedSeed } from "./util/seeds.js";
@@ -146,6 +148,8 @@ export class Crawler {
   pagesDir: string;
   pagesFile: string;

+  archivesDir: string;
+
   blockRules: BlockRules | null;
   adBlockRules: AdBlockRules | null;
@@ -154,11 +158,15 @@ export class Crawler {
   screencaster: ScreenCaster | null = null;

+  skipTextDocs = 0;
+
   interrupted = false;
   finalExit = false;
   uploadAndDeleteLocal = false;
   done = false;

+  textInPages = false;
+
   customBehaviors = "";
   behaviorsChecked = false;
   behaviorLastLine?: string;
@@ -178,10 +186,12 @@ export class Crawler {
     crawler: Crawler;
   }) => NonNullable<unknown>;

+  recording = true;
+
   constructor() {
-    const res = parseArgs();
-    this.params = res.parsed;
-    this.origConfig = res.origConfig;
+    const args = this.parseArgs();
+    this.params = args.parsed;
+    this.origConfig = args.origConfig;

     // root collections dir
     this.collDir = path.join(
@@ -259,6 +269,9 @@ export class Crawler {
     // pages file
     this.pagesFile = path.join(this.pagesDir, "pages.jsonl");

+    // archives dir
+    this.archivesDir = path.join(this.collDir, "archive");
+
     this.blockRules = null;
     this.adBlockRules = null;
@@ -268,6 +281,8 @@ export class Crawler {
     this.finalExit = false;
     this.uploadAndDeleteLocal = false;

+    this.textInPages = this.params.text.includes("to-pages");
+
     this.done = false;

     this.customBehaviors = "";
@@ -281,6 +296,10 @@ export class Crawler {
     }
   }

+  protected parseArgs() {
+    return parseArgs();
+  }
+
   configureUA() {
     // override userAgent
     if (this.params.userAgent) {
@@ -434,7 +453,9 @@ export class Crawler {
     //   logger.info("wb-manager init failed, collection likely already exists");
     //}

-    fs.mkdirSync(this.logDir, { recursive: true });
+    await fsp.mkdir(this.logDir, { recursive: true });
+    await fsp.mkdir(this.archivesDir, { recursive: true });
+
     this.logFH = fs.createWriteStream(this.logFilename);
     logger.setExternalLogStream(this.logFH);
@@ -721,10 +742,10 @@ self.__bx_behaviors.selectMainBehavior();
     return "";
   }

-  async crawlPage(opts: WorkerState) {
+  async crawlPage(opts: WorkerState): Promise<void> {
     await this.writeStats();

-    const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
+    const { page, data, workerid, callbacks, directFetchCapture } = opts;
     data.callbacks = callbacks;

     const { url } = data;
@@ -764,7 +785,7 @@ self.__bx_behaviors.selectMainBehavior();
           { url, ...logDetails },
           "fetch",
         );
-        return true;
+        return;
       }
     } catch (e) {
       // filtered out direct fetch
@@ -782,7 +803,14 @@ self.__bx_behaviors.selectMainBehavior();
     data.title = await page.title();
     data.favicon = await this.getFavicon(page, logDetails);

-    const archiveDir = path.join(this.collDir, "archive");
+    await this.doPostLoadActions(opts);
+  }
+
+  async doPostLoadActions(opts: WorkerState, saveOutput = false) {
+    const { page, cdp, data, workerid } = opts;
+    const { url } = data;
+
+    const logDetails = { page: url, workerid };

     if (this.params.screenshot) {
       if (!data.isHTMLPage) {
@@ -793,10 +821,10 @@ self.__bx_behaviors.selectMainBehavior();
         browser: this.browser,
         page,
         url,
-        directory: archiveDir,
+        directory: this.archivesDir,
       });
       if (this.params.screenshot.includes("view")) {
-        await screenshots.take();
+        await screenshots.take("view", saveOutput ? data : null);
       }
       if (this.params.screenshot.includes("fullPage")) {
         await screenshots.takeFullPage();
@@ -812,15 +840,16 @@ self.__bx_behaviors.selectMainBehavior();
       textextract = new TextExtractViaSnapshot(cdp, {
         warcPrefix: this.warcPrefix,
         url,
-        directory: archiveDir,
+        directory: this.archivesDir,
+        skipDocs: this.skipTextDocs,
       });
-      const { changed, text } = await textextract.extractAndStoreText(
+      const { text } = await textextract.extractAndStoreText(
         "text",
         false,
         this.params.text.includes("to-warc"),
       );

-      if (changed && text && this.params.text.includes("to-pages")) {
+      if (text && (this.textInPages || saveOutput)) {
         data.text = text;
       }
     }
@@ -868,8 +897,6 @@ self.__bx_behaviors.selectMainBehavior();
       );
       await sleep(this.params.pageExtraDelay);
     }
-
-    return true;
   }

   async pageFinished(data: PageState) {
@@ -1047,8 +1074,7 @@ self.__bx_behaviors.selectMainBehavior();
   async checkLimits() {
     let interrupt = false;

-    const dir = path.join(this.collDir, "archive");
-    const size = await getDirSize(dir);
+    const size = await getDirSize(this.archivesDir);

     await this.crawlState.setArchiveSize(size);
@@ -1230,28 +1256,11 @@ self.__bx_behaviors.selectMainBehavior();
     this.screencaster = this.initScreenCaster();

-    if (this.params.originOverride.length) {
+    if (this.params.originOverride && this.params.originOverride.length) {
       this.originOverride = new OriginOverride(this.params.originOverride);
     }

-    for (let i = 0; i < this.params.scopedSeeds.length; i++) {
-      const seed = this.params.scopedSeeds[i];
-      if (!(await this.queueUrl(i, seed.url, 0, 0))) {
-        if (this.limitHit) {
-          break;
-        }
-      }
-
-      if (seed.sitemap) {
-        await timedRun(
-          this.parseSitemap(seed, i),
-          SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
-          "Sitemap initial fetch timed out",
-          { sitemap: seed.sitemap, seed: seed.url },
-          "sitemap",
-        );
-      }
-    }
+    await this._addInitialSeeds();

     await this.browser.launch({
       profileUrl: this.params.profile,
@@ -1272,12 +1281,14 @@ self.__bx_behaviors.selectMainBehavior();
           "browser",
         );
       },
+      recording: this.recording,
       // eslint-disable-next-line @typescript-eslint/no-explicit-any
     } as any);

     // --------------
     // Run Crawl Here!
-    await runWorkers(this, this.params.workers, this.maxPageTime, this.collDir);
+    await runWorkers(this, this.params.workers, this.maxPageTime);
     // --------------

     await this.serializeConfig(true);
@@ -1297,6 +1308,27 @@ self.__bx_behaviors.selectMainBehavior();
     await this.postCrawl();
   }

+  protected async _addInitialSeeds() {
+    for (let i = 0; i < this.params.scopedSeeds.length; i++) {
+      const seed = this.params.scopedSeeds[i];
+      if (!(await this.queueUrl(i, seed.url, 0, 0))) {
+        if (this.limitHit) {
+          break;
+        }
+      }
+
+      if (seed.sitemap) {
+        await timedRun(
+          this.parseSitemap(seed, i),
+          SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
+          "Sitemap initial fetch timed out",
+          { sitemap: seed.sitemap, seed: seed.url },
+          "sitemap",
+        );
+      }
+    }
+  }
+
   async postCrawl() {
     if (this.params.combineWARC) {
       await this.combineWARC();
@@ -1307,9 +1339,9 @@ self.__bx_behaviors.selectMainBehavior();
     await fsp.mkdir(path.join(this.collDir, "indexes"), { recursive: true });
     await this.crawlState.setStatus("generate-cdx");

-    const warcList = await fsp.readdir(path.join(this.collDir, "archive"));
+    const warcList = await fsp.readdir(this.archivesDir);
     const warcListFull = warcList.map((filename) =>
-      path.join(this.collDir, "archive", filename),
+      path.join(this.archivesDir, filename),
     );

     //const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
@@ -1377,10 +1409,8 @@ self.__bx_behaviors.selectMainBehavior();
     logger.info("Generating WACZ");
     await this.crawlState.setStatus("generate-wacz");

-    const archiveDir = path.join(this.collDir, "archive");
-
     // Get a list of the warcs inside
-    const warcFileList = await fsp.readdir(archiveDir);
+    const warcFileList = await fsp.readdir(this.archivesDir);

     // is finished (>0 pages and all pages written)
     const isFinished = await this.crawlState.isFinished();
@@ -1440,7 +1470,9 @@ self.__bx_behaviors.selectMainBehavior();
     createArgs.push("-f");

-    warcFileList.forEach((val) => createArgs.push(path.join(archiveDir, val)));
+    warcFileList.forEach((val) =>
+      createArgs.push(path.join(this.archivesDir, val)),
+    );

     // create WACZ
     const waczResult = await this.awaitProcess(
@@ -1900,13 +1932,15 @@ self.__bx_behaviors.selectMainBehavior();
     depth: number,
     extraHops: number,
     logDetails: LogDetails = {},
+    ts = 0,
+    pageid?: string,
   ) {
     if (this.limitHit) {
       return false;
     }

     const result = await this.crawlState.addToQueue(
-      { url, seedId, depth, extraHops },
+      { url, seedId, depth, extraHops, ts, pageid },
       this.pageLimit,
     );
@@ -1954,7 +1988,7 @@ self.__bx_behaviors.selectMainBehavior();
       id: "pages",
       title: "All Pages",
     };
-    header["hasText"] = this.params.text.includes("to-pages");
+    header["hasText"] = String(this.textInPages);
     if (this.params.text.length) {
       logger.debug("Text Extraction: " + this.params.text.join(","));
     } else {
@@ -1968,20 +2002,30 @@ self.__bx_behaviors.selectMainBehavior();
     }
   }

-  async writePage({
-    pageid,
-    url,
-    depth,
-    title,
-    text,
-    loadState,
-    mime,
-    favicon,
-    ts,
-    status,
-  }: PageState) {
-    const row: PageEntry = { id: pageid!, url, title, loadState };
+  protected pageEntryForRedis(
+    entry: Record<string, string | number | boolean | object>,
+    // eslint-disable-next-line @typescript-eslint/no-unused-vars
+    state: PageState,
+  ) {
+    return entry;
+  }
+
+  async writePage(state: PageState) {
+    const {
+      pageid,
+      url,
+      depth,
+      title,
+      text,
+      loadState,
+      mime,
+      favicon,
+      status,
+    } = state;
+
+    const row: PageEntry = { id: pageid, url, title, loadState };

+    let { ts } = state;
     if (!ts) {
       ts = new Date();
       logger.warn("Page date missing, setting to now", { url, ts });
@@ -1998,14 +2042,16 @@ self.__bx_behaviors.selectMainBehavior();
     }

     if (this.params.writePagesToRedis) {
-      await this.crawlState.writeToPagesQueue(JSON.stringify(row));
+      await this.crawlState.writeToPagesQueue(
+        JSON.stringify(this.pageEntryForRedis(row, state)),
+      );
     }

     if (depth === 0) {
       row.seed = true;
     }

-    if (text) {
+    if (text && this.textInPages) {
       row.text = text;
     }
@@ -2151,7 +2197,7 @@ self.__bx_behaviors.selectMainBehavior();
     await this.crawlState.setStatus("generate-warc");

     // Get the list of created Warcs
-    const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));
+    const warcLists = await fsp.readdir(this.archivesDir);

     logger.debug(`Combining ${warcLists.length} WARCs...`);
@@ -2159,7 +2205,7 @@ self.__bx_behaviors.selectMainBehavior();
     // Go through a list of the created works and create an array sorted by their filesize with the largest file first.
     for (let i = 0; i < warcLists.length; i++) {
-      const fileName = path.join(this.collDir, "archive", warcLists[i]);
+      const fileName = path.join(this.archivesDir, warcLists[i]);
       const fileSize = await getFileSize(fileName);
       fileSizeObjects.push({ fileSize: fileSize, fileName: fileName });
       fileSizeObjects.sort((a, b) => b.fileSize - a.fileSize);
@@ -2316,6 +2362,21 @@ self.__bx_behaviors.selectMainBehavior();
       await this.storage.uploadFile(filename, targetFilename);
     }
   }
+
+  createRecorder(id: number): Recorder | null {
+    if (!this.recording) {
+      return null;
+    }
+
+    const res = new Recorder({
+      workerid: id,
+      collDir: this.collDir,
+      crawler: this,
+    });
+
+    this.browser.recorders.push(res);
+    return res;
+  }
 }

 function shouldIgnoreAbort(req: HTTPRequest) {
```
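
The net effect of these changes is that `Crawler` now exposes overridable hooks (`parseArgs()`, `_addInitialSeeds()`, `pageEntryForRedis()`, `createRecorder()`), which `ReplayCrawler` below overrides. A minimal sketch of the pattern, using a hypothetical subclass for illustration:

```ts
import { Crawler } from "./crawler.js";
import { Recorder } from "./util/recorder.js";
import { PageState } from "./util/state.js";

// Hypothetical subclass, for illustration only.
class CustomCrawler extends Crawler {
  // seed scheduling is now isolated in _addInitialSeeds()
  protected async _addInitialSeeds() {
    await this.queueUrl(0, "https://example.com/", 0, 0); // illustrative URL
  }

  // augment per-page Redis entries before they are written
  protected pageEntryForRedis(
    entry: Record<string, string | number | boolean | object>,
    state: PageState,
  ) {
    entry.mimeNote = state.mime || "unknown"; // hypothetical extra field
    return entry;
  }

  // returning null disables per-worker WARC recording
  createRecorder(): Recorder | null {
    return null;
  }
}
```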

**src/create-login-profile.ts:**

```diff
@@ -186,6 +186,7 @@ async function main() {
       "--test-type",
     ],
   },
+  recording: false,
 });

 if (params.interactive) {
```

**src/main.ts:**

```diff
@@ -3,6 +3,7 @@
 import { logger } from "./util/logger.js";
 import { setExitOnRedisError } from "./util/redis.js";
 import { Crawler } from "./crawler.js";
+import { ReplayCrawler } from "./replaycrawler.js";

 let crawler: Crawler | null = null;

@@ -49,5 +50,10 @@ process.on("SIGABRT", async () => {
   forceTerm = true;
 });

-crawler = new Crawler();
+if (process.argv[1].endsWith("qa")) {
+  crawler = new ReplayCrawler();
+} else {
+  crawler = new Crawler();
+}

 crawler.run();
```

**src/replaycrawler.ts** (new file, 731 lines):

```ts
import { Page, Protocol } from "puppeteer-core";
import { Crawler } from "./crawler.js";
import { ReplayServer } from "./util/replayserver.js";
import { sleep } from "./util/timing.js";
import { logger } from "./util/logger.js";
import { WorkerOpts, WorkerState } from "./util/worker.js";
import { PageState } from "./util/state.js";
import { PageInfoRecord, PageInfoValue, Recorder } from "./util/recorder.js";

import fsp from "fs/promises";
import path from "path";

// @ts-expect-error wabac.js
import { ZipRangeReader } from "@webrecorder/wabac/src/wacz/ziprangereader.js";
// @ts-expect-error wabac.js
import { createLoader } from "@webrecorder/wabac/src/blockloaders.js";

import { AsyncIterReader } from "warcio";
import { WARCResourceWriter } from "./util/warcresourcewriter.js";
import { parseArgs } from "./util/argParser.js";

import { PNG } from "pngjs";
import pixelmatch from "pixelmatch";
import levenshtein from "js-levenshtein";
import { MAX_URL_LENGTH } from "./util/reqresp.js";
import { openAsBlob } from "fs";

// RWP Replay Prefix
const REPLAY_PREFIX = "http://localhost:9990/replay/w/replay/";

// RWP Source Url
const REPLAY_SOURCE = "http://localhost:9990/replay/?source=";

// When iterating over page.frames(), the first two frames are for the top-level page
// and RWP embed, the actual content starts with frame index 2
const SKIP_FRAMES = 2;

type ReplayPage = {
  url: string;
  ts: number;
  id: string;
};

type ComparisonData = {
  comparison: {
    screenshotMatch?: number;
    textMatch?: number;
    resourceCounts: {
      crawlGood?: number;
      crawlBad?: number;
      replayGood?: number;
      replayBad?: number;
    };
  };
};

type ReplayPageInfoRecord = PageInfoRecord & ComparisonData;

type ComparisonPageState = PageState & ComparisonData;

// ============================================================================
// Crawler designed to run over replay of existing WACZ files to generate comparison
// data (eg. for QA)

export class ReplayCrawler extends Crawler {
  replayServer: ReplayServer;
  qaSource: string;

  pageInfos: Map<Page, ReplayPageInfoRecord>;

  reloadTimeouts: WeakMap<Page, NodeJS.Timeout>;

  constructor() {
    super();
    this.recording = false;
    if (!this.params.qaSource) {
      throw new Error("Missing QA source");
    }
    this.qaSource = this.params.qaSource;
    this.replayServer = new ReplayServer(this.qaSource);

    logger.info(
      "Replay Crawl with Source",
      { source: this.qaSource },
      "general",
    );

    this.pageInfos = new Map<Page, ReplayPageInfoRecord>();

    // skip text from first two frames, as they are RWP boilerplate
    this.skipTextDocs = SKIP_FRAMES;

    this.params.scopedSeeds = [];

    this.params.screenshot = ["view"];
    this.params.text = ["to-warc"];

    this.params.serviceWorker = "enabled";

    this.reloadTimeouts = new WeakMap<Page, NodeJS.Timeout>();
  }

  protected parseArgs() {
    return parseArgs(process.argv, true);
  }

  async setupPage(opts: WorkerState) {
    await super.setupPage(opts);
    const { page, cdp } = opts;

    if (!this.qaSource) {
      throw new Error("Missing QA source");
    }

    await cdp.send("Network.enable");

    cdp.on("Network.responseReceived", async (params) =>
      this.handlePageResourceResponse(params, page),
    );

    cdp.on("Network.requestWillBeSent", (params) =>
      this.handleRequestWillBeSent(params, page),
    );

    await page.goto(this.replayServer.homePage);

    // wait until content frame is available
    while (page.frames().length < SKIP_FRAMES) {
      await sleep(5);
    }

    const frame = page.frames()[1];

    await frame.evaluate(() => {
      return navigator.serviceWorker.ready;
    });
  }

  protected async _addInitialSeeds() {
    await this.loadPages(this.qaSource);
  }

  isInScope() {
    return true;
  }

  async loadPages(url: string) {
    let path = url;

    try {
      path = new URL(url).pathname;
    } catch (e) {
      // ignore
    }

    if (path.endsWith(".wacz")) {
      await this.loadPagesForWACZ(url);
    } else if (path.endsWith(".json")) {
      if (!url.startsWith("http://") && !url.startsWith("https://")) {
        const blob = await openAsBlob(url);
        url = URL.createObjectURL(blob);
      }

      const resp = await fetch(url);
      const json = await resp.json();

      // if json contains pages, just load them directly
      if (json.pages) {
        await this.loadPagesDirect(json.pages);
      } else {
        // otherwise, parse pages from WACZ files
        for (const entry of json.resources) {
          if (entry.path) {
            await this.loadPages(entry.path);
          }
        }
      }
    } else {
      logger.warn("Unknown replay source", { url }, "replay");
    }
  }

  async loadPagesForWACZ(url: string) {
    const loader = new WACZLoader(url);
    await loader.init();

    let count = 0;

    const pagesReader = await loader.loadFile("pages/pages.jsonl");

    if (pagesReader) {
      for await (const buff of pagesReader.iterLines()) {
        await this.addPage(buff, count++);
        if (this.limitHit) {
          break;
        }
      }
    }

    const extraPagesReader = await loader.loadFile("pages/extraPages.jsonl");

    if (extraPagesReader) {
      for await (const buff of extraPagesReader.iterLines()) {
        await this.addPage(buff, count++);
        if (this.limitHit) {
          break;
        }
      }
    }
  }

  async loadPagesDirect(pages: ReplayPage[]) {
    let depth = 0;
    for (const entry of pages) {
      const { url, ts, id } = entry;
      if (!url) {
        continue;
      }
      if (this.limitHit) {
        break;
      }
      await this.queueUrl(0, url, depth++, 0, {}, ts, id);
    }
  }

  async addPage(page: string, depth: number) {
    let pageData: ReplayPage;

    if (!page.length) {
      return;
    }

    try {
      pageData = JSON.parse(page);
    } catch (e) {
      console.log(page, e);
      return;
    }

    const { url, ts, id } = pageData;
    if (!url) {
      return;
    }

    await this.queueUrl(0, url, depth, 0, {}, ts, id);
  }

  extraChromeArgs(): string[] {
    return [...super.extraChromeArgs(), "--disable-web-security"];
  }

  handleRequestWillBeSent(
    params: Protocol.Network.RequestWillBeSentEvent,
    page: Page,
  ) {
    // only handling redirect here, committing last response in redirect chain
    const { redirectResponse, type } = params;
    if (redirectResponse) {
      const { url, status, mimeType } = redirectResponse;
      this.addPageResource(url, page, { status, mime: mimeType, type });
    }
  }

  async handlePageResourceResponse(
    params: Protocol.Network.ResponseReceivedEvent,
    page: Page,
  ) {
    const { response } = params;
    const { url, status } = response;
    if (!url.startsWith(REPLAY_PREFIX)) {
      if (url.startsWith(REPLAY_SOURCE)) {
        const { mimeType, fromServiceWorker } = response;
        if (
          !fromServiceWorker &&
          mimeType === "application/json" &&
          page.frames().length > 1
        ) {
          const frame = page.frames()[1];
          const timeoutid = setTimeout(() => {
            logger.warn("Reloading RWP Frame, not inited", { url }, "replay");
            frame.evaluate("window.location.reload();");
          }, 10000);
          this.reloadTimeouts.set(page, timeoutid);
        } else if (fromServiceWorker && mimeType !== "application/json") {
          const timeoutid = this.reloadTimeouts.get(page);
          if (timeoutid) {
            clearTimeout(timeoutid);
            this.reloadTimeouts.delete(page);
          }
        }
      }
      return;
    }

    const { type } = params;
    const { mimeType } = response;

    this.addPageResource(url, page, { status, mime: mimeType, type });
  }

  addPageResource(
    url: string,
    page: Page,
    { status, mime, type }: PageInfoValue,
  ) {
    const inx = url.indexOf("_/");
    if (inx <= 0) {
      return;
    }

    let replayUrl = url.slice(inx + 2, MAX_URL_LENGTH);

    const pageInfo = this.pageInfos.get(page);

    if (!pageInfo) {
      return;
    }

    if (replayUrl.startsWith("//")) {
      try {
        replayUrl = new URL(replayUrl, pageInfo.url).href;
      } catch (e) {
        //
      }
    }

    if (replayUrl.startsWith("http://") || replayUrl.startsWith("https://")) {
      pageInfo.urls[replayUrl] = { status, mime, type };
    }
  }

  async crawlPage(opts: WorkerState): Promise<void> {
    await this.writeStats();

    const { page, data } = opts;
    const { url, ts, pageid } = data;

    if (!ts) {
      return;
    }

    const date = new Date(ts);

    const timestamp = date.toISOString().slice(0, 19).replace(/[T:-]/g, "");

    logger.info("Loading Replay", { url, timestamp }, "replay");

    const pageInfo = {
      pageid,
      urls: {},
      url,
      ts: date,
      comparison: { resourceCounts: {} },
      counts: { jsErrors: 0 },
    };
    this.pageInfos.set(page, pageInfo);

    await page.evaluate(
      (url, ts) => {
        const rwp = document.querySelector("replay-web-page");
        if (!rwp) {
          return;
        }
        const p = new Promise<void>((resolve) => {
          window.addEventListener(
            "message",
            (e) => {
              if (e.data && e.data.url && e.data.view) {
                resolve();
              }
            },
            { once: true },
          );
        });
        rwp.setAttribute("url", url);
        rwp.setAttribute("ts", ts ? ts : "");
        return p;
      },
      url,
      timestamp,
    );

    // optionally reload (todo: reevaluate if this is needed)
    // await page.reload();

    await sleep(10);

    data.isHTMLPage = true;

    // skipping RWP frames
    data.filteredFrames = page.frames().slice(SKIP_FRAMES);

    try {
      data.title = await data.filteredFrames[0].title();
    } catch (e) {
      // ignore
    }

    data.favicon = await this.getFavicon(page, {});

    await this.doPostLoadActions(opts, true);

    await this.compareScreenshots(page, data, url, date);

    await this.compareText(page, data, url, date);

    await this.compareResources(page, data, url, date);

    await this.processPageInfo(page, data);
  }

  async compareScreenshots(
    page: Page,
    state: PageState,
    url: string,
    date?: Date,
  ) {
    const origScreenshot = await this.fetchOrigBinary(
      page,
      "view",
      url,
      date ? date.toISOString().replace(/[^\d]/g, "") : "",
    );
    const { pageid, screenshotView } = state;

    if (!origScreenshot || !origScreenshot.length) {
      logger.warn("Orig screenshot missing for comparison", { url }, "replay");
      return;
    }

    if (!screenshotView || !screenshotView.length) {
      logger.warn(
        "Replay screenshot missing for comparison",
        { url },
        "replay",
      );
      return;
    }

    const crawl = PNG.sync.read(origScreenshot);
    const replay = PNG.sync.read(screenshotView);

    const { width, height } = replay;
    const diff = new PNG({ width, height });

    const res = pixelmatch(crawl.data, replay.data, diff.data, width, height, {
      threshold: 0.1,
      alpha: 0,
    });

    const total = width * height;

    const matchPercent = (total - res) / total;

    logger.info(
      "Screenshot Diff",
      {
        url,
        diff: res,
        matchPercent,
      },
      "replay",
    );

    if (res && this.params.qaDebugImageDiff) {
      const dir = path.join(this.collDir, "screenshots", pageid || "unknown");
      await fsp.mkdir(dir, { recursive: true });
      await fsp.writeFile(path.join(dir, "crawl.png"), PNG.sync.write(crawl));
      await fsp.writeFile(path.join(dir, "replay.png"), PNG.sync.write(replay));
      await fsp.writeFile(path.join(dir, "diff.png"), PNG.sync.write(diff));
    }

    const pageInfo = this.pageInfos.get(page);
    if (pageInfo) {
      pageInfo.comparison.screenshotMatch = matchPercent;
    }
  }

  async compareText(page: Page, state: PageState, url: string, date?: Date) {
    const origText = await this.fetchOrigText(
      page,
      "text",
      url,
      date ? date.toISOString().replace(/[^\d]/g, "") : "",
    );
    const replayText = state.text;

    if (!origText || !replayText) {
      logger.warn(
        "Text missing for comparison",
        {
          url,
          origTextLen: origText?.length,
          replayTextLen: replayText?.length,
        },
        "replay",
      );
      return;
    }

    const dist = levenshtein(origText, replayText);
    const maxLen = Math.max(origText.length, replayText.length);
    const matchPercent = (maxLen - dist) / maxLen;
    logger.info("Levenshtein Dist", { url, dist, matchPercent, maxLen });

    const pageInfo = this.pageInfos.get(page);
    if (pageInfo) {
      pageInfo.comparison.textMatch = matchPercent;
    }
  }

  async compareResources(
    page: Page,
    state: PageState,
    url: string,
    date?: Date,
  ) {
    const origResources = await this.fetchOrigText(
      page,
      "pageinfo",
      url,
      date ? date.toISOString().replace(/[^\d]/g, "") : "",
    );

    let origResData: PageInfoRecord | null;

    try {
      origResData = JSON.parse(origResources || "");
    } catch (e) {
      origResData = null;
    }

    const pageInfo: ReplayPageInfoRecord | undefined = this.pageInfos.get(page);

    if (!origResData) {
      logger.warn("Original resources missing / invalid", { url }, "replay");
      return;
    }

    if (!pageInfo) {
      logger.warn("Replay resources missing / invalid", { url }, "replay");
      return;
    }

    if (origResData.ts) {
      pageInfo.ts = origResData.ts;
    }

    const { resourceCounts } = pageInfo.comparison;

    const { good: crawlGood, bad: crawlBad } = this.countResources(origResData);
    const { good: replayGood, bad: replayBad } = this.countResources(pageInfo);

    resourceCounts.crawlGood = crawlGood;
    resourceCounts.crawlBad = crawlBad;
    resourceCounts.replayGood = replayGood;
    resourceCounts.replayBad = replayBad;

    logger.info("Resource counts", { url, ...resourceCounts }, "replay");
  }

  countResources(info: PageInfoRecord) {
    let good = 0;
    let bad = 0;

    for (const [url, { status }] of Object.entries(info.urls)) {
      if (!url.startsWith("http")) {
        continue;
      }
      if (url.indexOf("__wb_method") !== -1) {
        continue;
      }
      if (status >= 400) {
        bad++;
      } else {
        good++;
      }
    }

    return { bad, good };
  }

  async fetchOrigBinary(page: Page, type: string, url: string, ts: string) {
    const frame = page.frames()[1];
    if (!frame) {
      logger.warn("Replay frame missing", { url }, "replay");
      return;
    }

    const replayUrl = REPLAY_PREFIX + `${ts}mp_/urn:${type}:${url}`;

    const binaryString = await frame.evaluate(async (url) => {
      const response = await fetch(url, {
        method: "GET",
        credentials: "include",
      });
      if (response.status !== 200) {
        return "";
      }
      const blob = await response.blob();
      const result = new Promise((resolve, reject) => {
        const reader = new FileReader();
        reader.onloadend = () => resolve(reader.result);
        reader.onerror = reject;
        reader.readAsBinaryString(blob);
      });
      return result;
    }, replayUrl);

    if (!binaryString) {
      logger.warn("Couldn't fetch original data", { type, url, ts }, "replay");
    }

    return Buffer.from(binaryString as string, "binary");
  }

  async fetchOrigText(page: Page, type: string, url: string, ts: string) {
    const frame = page.frames()[1];
    if (!frame) {
      logger.warn("Replay frame missing", { url }, "replay");
      return;
    }

    const replayUrl = REPLAY_PREFIX + `${ts}mp_/urn:${type}:${url}`;

    const text = await frame.evaluate(async (url) => {
      const response = await fetch(url, {
        method: "GET",
        credentials: "include",
      });
      if (response.status !== 200) {
        return "";
      }
      return await response.text();
    }, replayUrl);

    if (!text) {
      logger.warn("Couldn't fetch original data", { type, url, ts }, "replay");
    }

    return text;
  }

  async teardownPage(opts: WorkerOpts) {
    const { page } = opts;
    await this.processPageInfo(page);
    await super.teardownPage(opts);
  }

  async processPageInfo(page: Page, state?: PageState) {
    const pageInfo = this.pageInfos.get(page);
    if (pageInfo) {
      if (!pageInfo.urls[pageInfo.url]) {
        logger.warn(
          "Replay resource: missing top-level page",
          { url: pageInfo.url },
          "replay",
        );
      }

      if (state) {
        const { comparison } = pageInfo;

        // add comparison to page state
        (state as ComparisonPageState).comparison = comparison;
      }

      const writer = new WARCResourceWriter({
        url: pageInfo.url,
        directory: this.archivesDir,
        warcPrefix: this.warcPrefix,
        date: new Date(),
        warcName: "info.warc.gz",
      });
      await writer.writeBufferToWARC(
        new TextEncoder().encode(JSON.stringify(pageInfo, null, 2)),
        "pageinfo",
        "application/json",
      );
      this.pageInfos.delete(page);
    }
  }

  protected pageEntryForRedis(
    entry: Record<string, string | number | boolean | object>,
    state: PageState,
  ) {
    entry.comparison = (state as ComparisonPageState).comparison;
    return entry;
  }

  createRecorder(): Recorder | null {
    return null;
  }
}

class WACZLoader {
  url: string;
  zipreader: ZipRangeReader;

  constructor(url: string) {
    this.url = url;
    this.zipreader = null;
  }

  async init() {
    if (!this.url.startsWith("http://") && !this.url.startsWith("https://")) {
      const blob = await openAsBlob(this.url);
      this.url = URL.createObjectURL(blob);
    }

    const loader = await createLoader({ url: this.url });

    this.zipreader = new ZipRangeReader(loader);
  }

  async loadFile(fileInZip: string) {
    const { reader } = await this.zipreader.loadFile(fileInZip);

    if (!reader) {
      return null;
    }

    if (!reader.iterLines) {
      return new AsyncIterReader(reader);
    }

    return reader;
  }
}
```
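
For reference, `loadPages()` above accepts either a JSON file with a top-level `pages` array or a multi-WACZ manifest whose `resources` entries point at further WACZ/JSON files. A sketch of both shapes (URLs and values are illustrative):

```ts
// Direct pages list: queued as-is via loadPagesDirect().
const directPages = {
  pages: [
    { url: "https://example.com/", ts: 1711130000000, id: "page-1" },
    { url: "https://example.com/about", ts: 1711130060000, id: "page-2" },
  ],
};

// Multi-WACZ manifest: each resource path is recursively fed back into loadPages().
const multiWACZ = {
  resources: [
    { path: "https://example.com/archives/crawl-1.wacz" },
    { path: "https://example.com/archives/crawl-2.wacz" },
  ],
};
```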

**src/util/argParser.ts:**

```diff
@@ -536,10 +536,21 @@ class ArgParser {
       choices: SERVICE_WORKER_OPTS,
       default: "disabled",
     },
+
+    qaSource: {
+      describe: "Required for QA mode. Source (WACZ or multi WACZ) for QA",
+      type: "string",
+    },
+
+    qaDebugImageDiff: {
+      describe:
+        "if specified, will write crawl.png, replay.png and diff.png for each page where they're different",
+      type: "boolean",
+    },
   };
 }

-parseArgs(argvParams?: string[]) {
+parseArgs(argvParams?: string[], isQA = false) {
   let argv = argvParams || process.argv;

   if (process.env.CRAWL_ARGS) {
@@ -563,7 +574,7 @@
           return origConfig;
         },
       )
-      .check((argv) => this.validateArgs(argv)).argv;
+      .check((argv) => this.validateArgs(argv, isQA)).argv;

     return { parsed, origConfig };
   }
@@ -576,7 +587,7 @@
   }

   // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  validateArgs(argv: Record<string, any>) {
+  validateArgs(argv: Record<string, any>, isQA: boolean) {
     argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname;
     argv.collection = interpolateFilename(argv.collection, argv.crawlId);
@@ -631,33 +642,39 @@
       //logger.debug(`Set netIdleWait to ${argv.netIdleWait} seconds`);
     }

-    const scopeOpts = {
-      scopeType: argv.scopeType,
-      sitemap: argv.sitemap,
-      include: argv.include,
-      exclude: argv.exclude,
-      depth: argv.depth,
-      extraHops: argv.extraHops,
-    };
-
     argv.scopedSeeds = [];

-    for (let seed of argv.seeds) {
-      if (typeof seed === "string") {
-        seed = { url: seed };
-      }
-
-      try {
-        argv.scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...seed }));
-      } catch (e) {
-        if (argv.failOnFailedSeed) {
-          logger.fatal(`Invalid Seed "${seed.url}" specified, aborting crawl.`);
+    if (!isQA) {
+      const scopeOpts = {
+        scopeType: argv.scopeType,
+        sitemap: argv.sitemap,
+        include: argv.include,
+        exclude: argv.exclude,
+        depth: argv.depth,
+        extraHops: argv.extraHops,
+      };
+
+      for (let seed of argv.seeds) {
+        if (typeof seed === "string") {
+          seed = { url: seed };
+        }
+
+        try {
+          argv.scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...seed }));
+        } catch (e) {
+          if (argv.failOnFailedSeed) {
+            logger.fatal(
+              `Invalid Seed "${seed.url}" specified, aborting crawl.`,
+            );
+          }
         }
       }
-    }

-    if (!argv.scopedSeeds.length) {
-      logger.fatal("No valid seeds specified, aborting crawl.");
+      if (!argv.scopedSeeds.length) {
+        logger.fatal("No valid seeds specified, aborting crawl.");
+      }
+    } else if (!argv.qaSource) {
+      logger.fatal("--qaSource required for QA mode!");
     }

     // Resolve statsFilename
@@ -673,6 +690,6 @@
   }
 }

-export function parseArgs(argv?: string[]) {
-  return new ArgParser().parseArgs(argv);
+export function parseArgs(argv?: string[], isQA = false) {
+  return new ArgParser().parseArgs(argv, isQA);
 }
```

**src/util/browser.ts:**

```diff
@@ -19,22 +19,27 @@ import puppeteer, {
   Viewport,
 } from "puppeteer-core";
 import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
+import { Recorder } from "./recorder.js";
+
+type BtrixChromeOpts = {
+  proxy?: boolean;
+  userAgent?: string | null;
+  extraArgs?: string[];
+};

 type LaunchOpts = {
   profileUrl: string;
-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  chromeOptions: Record<string, any>;
+  chromeOptions: BtrixChromeOpts;
   signals: boolean;
   headless: boolean;
   // TODO: Fix this the next time the file is edited.
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
   emulateDevice?: Record<string, any>;
-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  ondisconnect?: ((err: any) => NonNullable<unknown>) | null;
+  ondisconnect?: ((err: unknown) => NonNullable<unknown>) | null;

   swOpt?: ServiceWorkerOpt;
+
+  recording: boolean;
 };

 // ==================================================================
@@ -48,9 +53,7 @@ export class Browser {
   browser?: PptrBrowser | null = null;
   firstCDP: CDPSession | null = null;

-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  recorders: any[] = [];
+  recorders: Recorder[] = [];

   swOpt?: ServiceWorkerOpt = "disabled";
@@ -66,6 +69,7 @@ export class Browser {
     emulateDevice = {},
     swOpt = "disabled",
     ondisconnect = null,
+    recording = true,
   }: LaunchOpts) {
     if (this.isLaunched()) {
       return;
@@ -105,7 +109,7 @@
       userDataDir: this.profileDir,
     };

-    await this._init(launchOpts, ondisconnect);
+    await this._init(launchOpts, ondisconnect, recording);
   }

   async setupPage({ page }: { page: Page; cdp: CDPSession }) {
@@ -116,13 +120,13 @@
     switch (this.swOpt) {
       case "disabled":
-        logger.info("Service Workers: always disabled", {}, "browser");
+        logger.debug("Service Workers: always disabled", {}, "browser");
         await page.setBypassServiceWorker(true);
         break;

       case "disabled-if-profile":
         if (this.customProfile) {
-          logger.info(
+          logger.debug(
             "Service Workers: disabled since using profile",
             {},
             "browser",
@@ -132,7 +136,7 @@
         break;

       case "enabled":
-        logger.info("Service Workers: always enabled", {}, "browser");
+        logger.debug("Service Workers: always enabled", {}, "browser");
         break;
     }
   }
@@ -195,7 +199,11 @@
     });
   }

-  chromeArgs({ proxy = true, userAgent = null, extraArgs = [] } = {}) {
+  chromeArgs({
+    proxy = true,
+    userAgent = null,
+    extraArgs = [],
+  }: BtrixChromeOpts) {
     // Chrome Flags, including proxy server
     const args = [
       // eslint-disable-next-line no-use-before-define
@@ -347,6 +355,7 @@
     launchOpts: PuppeteerLaunchOptions,
     // eslint-disable-next-line @typescript-eslint/ban-types
     ondisconnect: Function | null = null,
+    recording: boolean,
   ) {
     this.browser = await puppeteer.launch(launchOpts);
@@ -354,7 +363,9 @@
     this.firstCDP = await target.createCDPSession();

-    await this.serviceWorkerFetch();
+    if (recording) {
+      await this.serviceWorkerFetch();
+    }

     if (ondisconnect) {
       this.browser.on("disconnected", (err) => ondisconnect(err));
@@ -497,8 +508,6 @@
     });
   }

-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
   async evaluateWithCLI(
     _: unknown,
     frame: Frame,
```

**src/util/logger.ts:**

```diff
@@ -48,6 +48,7 @@ export const LOG_CONTEXT_TYPES = [
   "crawlStatus",
   "links",
   "sitemap",
+  "replay",
 ] as const;

 export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
```

**src/util/replayserver.ts** (new file, 146 lines):

```ts
import fs from "fs";
import fsp from "fs/promises";
import http, { IncomingMessage, ServerResponse } from "http";
import path from "path";

const replayHTML = fs.readFileSync(
  new URL("../../html/replay.html", import.meta.url),
  { encoding: "utf8" },
);

const swJS = fs.readFileSync(new URL("../../html/rwp/sw.js", import.meta.url), {
  encoding: "utf8",
});

const uiJS = fs.readFileSync(new URL("../../html/rwp/ui.js", import.meta.url), {
  encoding: "utf8",
});

// ============================================================================
const PORT = 9990;

// ============================================================================
export class ReplayServer {
  sourceUrl: string;
  origFileSource: string | null;
  sourceContentType: string | null;
  sourceSize?: number;

  constructor(sourceUrlOrFile: string) {
    if (
      sourceUrlOrFile.startsWith("http://") ||
      sourceUrlOrFile.startsWith("https://")
    ) {
      this.sourceUrl = sourceUrlOrFile;
      this.origFileSource = null;
      this.sourceContentType = null;
    } else {
      this.origFileSource = sourceUrlOrFile;
      const ext = path.extname(sourceUrlOrFile);
      this.sourceUrl = `/source${ext}`;
      switch (ext) {
        case ".wacz":
          this.sourceContentType = "application/wacz+zip";
          break;

        case ".json":
          this.sourceContentType = "application/json";
          break;

        default:
          this.sourceContentType = "application/octet-stream";
      }
    }

    const httpServer = http.createServer((req, res) =>
      this.handleRequest(req, res),
    );
    httpServer.listen(PORT);
  }

  get homePage() {
    return `http://localhost:${PORT}/`;
  }

  async handleRequest(request: IncomingMessage, response: ServerResponse) {
    const parsedUrl = new URL(
      request.url || "",
      `http://${request.headers.host}`,
    );
    const pathname = parsedUrl.pathname;

    switch (pathname) {
      case "/":
        response.writeHead(200, { "Content-Type": "text/html" });
        response.end(replayHTML.replace("$SOURCE", this.sourceUrl));
        return;

      case "/sw.js":
      case "/sw.js?serveIndex=1":
      case "/replay/sw.js":
      case "/replay/sw.js?serveIndex=1":
        response.writeHead(200, { "Content-Type": "application/javascript" });
        response.end(swJS);
        return;

      case "/ui.js":
        response.writeHead(200, { "Content-Type": "application/javascript" });
        response.end(uiJS);
        return;

      case this.sourceUrl:
        if (this.sourceContentType && this.origFileSource) {
          if (!this.sourceSize) {
            const { size } = await fsp.stat(this.origFileSource);
            this.sourceSize = size;
          }

          const { opts, status, contentRange, contentLength } =
            this.getRespOptsForRequest(request, this.sourceSize);

          response.writeHead(status, {
            "Accept-Ranges": "bytes",
            "Content-Type": this.sourceContentType,
            "Content-Length": contentLength,
            "Content-Range": contentRange,
          });

          //console.log(request.method, contentRange, opts);

          if (request.method === "GET") {
            fs.createReadStream(this.origFileSource, opts).pipe(response);
          } else {
            response.end();
          }
          break;
        }
      // falls through

      default:
        response.writeHead(404, { "Content-Type": "application/json" });
        response.end(JSON.stringify({ error: "not_found" }));
        return;
    }
  }

  getRespOptsForRequest(request: IncomingMessage, total: number) {
    const range = request.headers["range"] || "";
    const array = range.match(/bytes=(\d+)-(\d*)/);
    let contentRange = undefined;
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const opts: Record<string, any> = {};
    if (array) {
      opts.start = parseInt(array[1]);
      opts.end = parseInt(array[2]);
      if (isNaN(opts.end)) {
        opts.end = undefined;
      }
      const end = opts.end || total - 1;
      contentRange = `bytes ${opts.start}-${end}/${total}`;
      return {
        status: 206,
        opts,
        contentRange,
        contentLength: end - opts.start + 1,
      };
    }
    return { status: 200, opts, contentRange, contentLength: total };
  }
}
```
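
A quick way to exercise the range handling above (Node 18+ global `fetch`; assumes the crawler was given a local `.wacz` file, so the source is served at `/source.wacz`):

```ts
// Request the first 100 bytes of the served WACZ and inspect the partial response.
const resp = await fetch("http://localhost:9990/source.wacz", {
  headers: { Range: "bytes=0-99" },
});
console.log(resp.status); // expected: 206
console.log(resp.headers.get("content-range")); // e.g. "bytes 0-99/<total size>"
console.log((await resp.arrayBuffer()).byteLength); // expected: 100
```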

**src/util/screenshots.ts:**

```diff
@@ -3,48 +3,63 @@ import sharp from "sharp";
 import { WARCResourceWriter } from "./warcresourcewriter.js";
 import { logger, formatErr } from "./logger.js";
 import { Browser } from "./browser.js";
+import { Page } from "puppeteer-core";
+import { PageState } from "./state.js";

 // ============================================================================
-type ScreenShotType = {
-  type: string;
+type ScreenShotDesc = {
+  type: "png" | "jpeg";
   omitBackground: boolean;
   fullPage: boolean;
-  encoding: "binary";
 };

-export const screenshotTypes: Record<string, ScreenShotType> = {
+type ScreeshotType = "view" | "thumbnail" | "fullPage";
+
+export const screenshotTypes: Record<string, ScreenShotDesc> = {
   view: {
     type: "png",
     omitBackground: true,
     fullPage: false,
-    encoding: "binary",
   },
   thumbnail: {
     type: "jpeg",
     omitBackground: true,
     fullPage: false,
-    encoding: "binary",
   },
   fullPage: {
     type: "png",
     omitBackground: true,
     fullPage: true,
-    encoding: "binary",
   },
 };

+export type ScreenshotOpts = {
+  browser: Browser;
+  page: Page;
+  url: string;
+  directory: string;
+  warcPrefix: string;
+};
+
 export class Screenshots extends WARCResourceWriter {
   browser: Browser;
-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  page: any;
+  page: Page;

-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  constructor(opts: any) {
+  constructor(opts: ScreenshotOpts) {
     super({ ...opts, warcName: "screenshots.warc.gz" });
     this.browser = opts.browser;
     this.page = opts.page;
   }

-  async take(screenshotType = "view") {
+  async take(
+    screenshotType: ScreeshotType = "view",
+    state: PageState | null = null,
+  ) {
     try {
       if (screenshotType !== "fullPage") {
         await this.browser.setViewport(this.page, {
@@ -54,6 +69,9 @@ export class Screenshots extends WARCResourceWriter {
       }
       const options = screenshotTypes[screenshotType];
       const screenshotBuffer = await this.page.screenshot(options);
+      if (state && screenshotType === "view") {
+        state.screenshotView = screenshotBuffer;
+      }
       await this.writeBufferToWARC(
         screenshotBuffer,
         screenshotType,
```

**src/util/state.ts:**

```diff
@@ -33,6 +33,8 @@ export type QueueEntry = {
   seedId: number;
   depth: number;
   extraHops: number;
+  ts?: number;
+  pageid?: string;
 };

 // ============================================================================
@@ -66,6 +68,7 @@ export class PageState {
   isHTMLPage?: boolean;
   text?: string;
+  screenshotView?: Buffer;
   favicon?: string;

   skipBehaviors = false;
@@ -79,7 +82,10 @@
     this.seedId = redisData.seedId;
     this.depth = redisData.depth;
     this.extraHops = redisData.extraHops || 0;
-    this.pageid = uuidv4();
+    if (redisData.ts) {
+      this.ts = new Date(redisData.ts);
+    }
+    this.pageid = redisData.pageid || uuidv4();
     this.status = 0;
   }
 }
@@ -472,12 +478,26 @@
   //async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) {
   async addToQueue(
-    { url, seedId, depth = 0, extraHops = 0 }: QueueEntry,
+    {
+      url,
+      seedId,
+      depth = 0,
+      extraHops = 0,
+      ts = 0,
+      pageid = undefined,
+    }: QueueEntry,
     limit = 0,
   ) {
     const added = this._timestamp();
     const data: QueueEntry = { added, url, seedId, depth, extraHops };
+    if (ts) {
+      data.ts = ts;
+    }
+    if (pageid) {
+      data.pageid = pageid;
+    }

     // return codes
     // 0 - url queued successfully
     // 1 - url queue size limit reached
```

**src/util/textextract.ts:**

```diff
@@ -2,17 +2,25 @@ import { WARCResourceWriter } from "./warcresourcewriter.js";
 import { logger } from "./logger.js";
 import { CDPSession, Protocol } from "puppeteer-core";

+// ============================================================================
+type TextExtractOpts = {
+  url: string;
+  directory: string;
+  warcPrefix: string;
+  skipDocs: number;
+};
+
 // ============================================================================
 export abstract class BaseTextExtract extends WARCResourceWriter {
   cdp: CDPSession;
   lastText: string | null = null;
   text: string | null = null;
+  skipDocs: number = 0;

-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  constructor(cdp: CDPSession, opts: any) {
+  constructor(cdp: CDPSession, opts: TextExtractOpts) {
     super({ ...opts, warcName: "text.warc.gz" });
     this.cdp = cdp;
+    this.skipDocs = opts.skipDocs || 0;
   }

   async extractAndStoreText(
@@ -83,7 +91,7 @@ export class TextExtractViaSnapshot extends BaseTextExtract {
     const accum: string[] = [];

-    for (const doc of documents) {
+    for (const doc of documents.slice(this.skipDocs)) {
       const nodeValues = doc.nodes.nodeValue || [];
       const nodeNames = doc.nodes.nodeName || [];
       const nodeTypes = doc.nodes.nodeType || [];
```

**src/util/warcresourcewriter.ts:**

```diff
@@ -2,10 +2,17 @@ import fs from "fs";
 import path from "path";
 import * as warcio from "warcio";

+// ===========================================================================
+export type WARCResourceWriterOpts = {
+  url: string;
+  directory: string;
+  date?: Date;
+  warcName: string;
+  warcPrefix: string;
+};
+
+// ===========================================================================
 export class WARCResourceWriter {
-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  page: any;
   url: string;
   directory: string;
   warcName: string;
@@ -17,13 +24,7 @@
     date,
     warcPrefix,
     warcName,
-  }: {
-    url: string;
-    directory: string;
-    date: Date;
-    warcPrefix: string;
-    warcName: string;
-  }) {
+  }: WARCResourceWriterOpts) {
     this.url = url;
     this.directory = directory;
     this.warcName = path.join(this.directory, warcPrefix + warcName);
```

View file

@@ -14,7 +14,6 @@ const NEW_WINDOW_TIMEOUT = 20;
 const TEARDOWN_TIMEOUT = 10;
 const FINISHED_TIMEOUT = 60;

-// ===========================================================================
 export type WorkerOpts = {
   page: Page;
   cdp: CDPSession;
@@ -39,6 +38,7 @@ export class PageWorker {
   maxPageTime: number;

   reuseCount = 0;
+  alwaysReuse: boolean;

   page?: Page | null;
   cdp?: CDPSession | null;
@@ -55,27 +55,22 @@ export class PageWorker {
   markCrashed?: (reason: string) => void;
   crashBreak?: Promise<void>;

-  recorder: Recorder;
+  recorder: Recorder | null;

   constructor(
     id: WorkerId,
     crawler: Crawler,
     maxPageTime: number,
-    collDir: string,
+    alwaysReuse = false,
   ) {
     this.id = id;
     this.crawler = crawler;
     this.maxPageTime = maxPageTime;
+    this.alwaysReuse = alwaysReuse;

     this.logDetails = { workerid: this.id };

-    this.recorder = new Recorder({
-      workerid: id,
-      collDir,
-      crawler: this.crawler,
-    });
-    this.crawler.browser.recorders.push(this.recorder);
+    this.recorder = this.crawler.createRecorder(this.id);
   }

   async closePage() {
@@ -133,19 +128,18 @@ export class PageWorker {
   }

   async initPage(url: string): Promise<WorkerOpts> {
-    if (
-      !this.crashed &&
-      this.page &&
-      this.opts &&
-      ++this.reuseCount <= MAX_REUSE &&
-      this.isSameOrigin(url)
-    ) {
+    let reuse = !this.crashed && !!this.opts && !!this.page;
+    if (!this.alwaysReuse) {
+      ++this.reuseCount;
+      reuse = this.reuseCount <= MAX_REUSE && this.isSameOrigin(url);
+    }
+    if (reuse) {
       logger.debug(
         "Reusing page",
         { reuseCount: this.reuseCount, ...this.logDetails },
         "worker",
       );
-      return this.opts;
+      return this.opts!;
     } else if (this.page) {
       await this.closePage();
     }
@@ -176,7 +170,7 @@ export class PageWorker {
     this.cdp = cdp;
     this.callbacks = {};

     const directFetchCapture = this.recorder
-      ? (x: string) => this.recorder.directFetchCapture(x)
+      ? (x: string) => this.recorder!.directFetchCapture(x)
       : null;

     this.opts = {
       page,
@@ -405,10 +399,11 @@ export async function runWorkers(
   crawler: Crawler,
   numWorkers: number,
   maxPageTime: number,
-  collDir: string,
+  alwaysReuse = false,
 ) {
   logger.info(`Creating ${numWorkers} workers`, {}, "worker");

+  const workers = [];
   let offset = 0;

   // automatically set worker start by ordinal in k8s
@@ -426,7 +421,7 @@ export async function runWorkers(
   }

   for (let i = 0; i < numWorkers; i++) {
-    workers.push(new PageWorker(i + offset, crawler, maxPageTime, collDir));
+    workers.push(new PageWorker(i + offset, crawler, maxPageTime, alwaysReuse));
   }

   await Promise.allSettled(workers.map((worker) => worker.run()));
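In short, QA workers can pass `alwaysReuse = true` so the same tab, holding the replay embed, serves every URL, while normal crawls still rotate pages after `MAX_REUSE` uses or a cross-origin jump. A standalone restatement of that decision, with an assumed `MAX_REUSE` value:

```
const MAX_REUSE = 5; // assumed for illustration; the real cap lives in worker.ts

// Mirrors the reuse decision in initPage() above: in QA/replay mode
// (alwaysReuse) any live, previously-opened page is reused; in normal
// crawls reuse is capped at MAX_REUSE and requires a same-origin URL.
function shouldReusePage(
  state: { crashed: boolean; hasPage: boolean; alwaysReuse: boolean; reuseCount: number },
  sameOrigin: boolean,
): boolean {
  let reuse = !state.crashed && state.hasPage;
  if (!state.alwaysReuse) {
    state.reuseCount++;
    reuse = state.reuseCount <= MAX_REUSE && sameOrigin;
  }
  return reuse;
}
```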


@@ -1,6 +1,10 @@
 import { exec } from "child_process";
 import Redis from "ioredis";

+function sleep(ms) {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
 test("dynamically add exclusion while crawl is running", async () => {
   let callback = null;
@@ -12,7 +16,7 @@ test("dynamically add exclusion while crawl is running", async () => {
   try {
     exec(
-      "docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis",
+      "docker run -p 36382:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis",
       { shell: "/bin/bash" },
       callback,
     );
@@ -20,18 +24,18 @@ test("dynamically add exclusion while crawl is running", async () => {
     console.log(error);
   }

-  await new Promise((resolve) => setTimeout(resolve, 3000));
+  await sleep(3000);

-  const redis = new Redis("redis://127.0.0.1:36379/0", { lazyConnect: true });
+  const redis = new Redis("redis://127.0.0.1:36382/0", { lazyConnect: true, retryStrategy: () => null });

-  await redis.connect({ maxRetriesPerRequest: 50 });
+  await redis.connect();

   while (true) {
     if (Number(await redis.zcard("test:q")) > 1) {
       break;
     }
-    await new Promise((resolve) => setTimeout(resolve, 500));
+    await sleep(500);
   }

   const uids = await redis.hkeys("test:status");
@@ -48,6 +52,5 @@ test("dynamically add exclusion while crawl is running", async () => {
   expect(stdout.indexOf("Add Exclusion") > 0).toBe(true);
   expect(stdout.indexOf("Removing excluded URL") > 0).toBe(true);

-  await redis.disconnect();
 });
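The connection options changed here are worth noting: `lazyConnect` defers connecting until `connect()` is called, and a `retryStrategy` that returns `null` tells ioredis to stop reconnecting once the container exits and the mapped port closes, rather than retrying forever; this is also why the explicit `disconnect()` in the teardown could be dropped. A sketch of the shared pattern:

```
import { Redis } from "ioredis";

// Connect once; give up cleanly (no endless reconnect loop) when the
// crawler container stops and the mapped Redis port goes away.
async function connectOnce(url: string): Promise<Redis> {
  const redis = new Redis(url, { lazyConnect: true, retryStrategy: () => null });
  await redis.connect();
  return redis;
}
```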

tests/qa_compare.test.js (new file, 75 lines)

@@ -0,0 +1,75 @@
+import child_process from "child_process";
+import fs from "fs";
+import { Redis } from "ioredis";
+
+const sleep = (ms) => new Promise((res) => setTimeout(res, ms));
+
+test("run initial crawl with text and screenshots to prepare for QA", async () => {
+  fs.rmSync("./test-crawls/qa-wr-net", { recursive: true, force: true });
+
+  child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --url https://webrecorder.net/about --url https://browsertrix.com/ --scopeType page --collection qa-wr-net --text to-warc --screenshot view --generateWACZ",
+  );
+
+  expect(
+    fs.existsSync("test-crawls/collections/qa-wr-net/qa-wr-net.wacz"),
+  ).toBe(true);
+});
+
+test("run QA comparison, with write pages to redis", async () => {
+  fs.rmSync("./test-crawls/qa-wr-net-replay", { recursive: true, force: true });
+
+  const child = child_process.exec(
+    "docker run -p 36380:6379 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler qa --qaSource /crawls/collections/qa-wr-net/qa-wr-net.wacz --collection qa-wr-net-replay --crawlId test --qaDebugImageDiff --writePagesToRedis --debugAccessRedis",
+  );
+
+  // detect crawler exit
+  let crawler_exited = false;
+  child.on("exit", function () {
+    crawler_exited = true;
+  });
+
+  const redis = new Redis("redis://127.0.0.1:36380/0", { lazyConnect: true, retryStrategy: () => null });
+
+  await sleep(3000);
+
+  await redis.connect({ maxRetriesPerRequest: 50 });
+
+  let count = 0;
+
+  while (count < 3) {
+    const res = await redis.lpop("test:pages");
+    if (!res) {
+      if (crawler_exited) {
+        break;
+      }
+      await sleep(100);
+      continue;
+    }
+    const json = JSON.parse(res);
+    expect(json).toHaveProperty("id");
+    expect(json).toHaveProperty("url");
+    expect(json).toHaveProperty("ts");
+    expect(json).toHaveProperty("title");
+    expect(json).toHaveProperty("loadState");
+    expect(json).toHaveProperty("comparison");
+    expect(json.comparison).toHaveProperty("screenshotMatch");
+    expect(json.comparison).toHaveProperty("textMatch");
+    expect(json.comparison).toHaveProperty("resourceCounts");
+    expect(json.comparison.resourceCounts).toHaveProperty("crawlGood");
+    expect(json.comparison.resourceCounts).toHaveProperty("crawlBad");
+    expect(json.comparison.resourceCounts).toHaveProperty("replayGood");
+    expect(json.comparison.resourceCounts).toHaveProperty("replayBad");
+    count++;
+  }
+
+  expect(count).toBe(3);
+
+  // wait for crawler exit
+  while (!crawler_exited) {
+    await sleep(100);
+  }
+});
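The comparison data itself carries no verdict; a consumer decides what counts as a match. A hypothetical helper, not part of the crawler, that derives a pass/fail verdict from the record asserted above, with an illustrative threshold:

```
// Hypothetical QA consumer logic; the 0.9 threshold is illustrative.
type ResourceCounts = {
  crawlGood?: number;
  crawlBad?: number;
  replayGood?: number;
  replayBad?: number;
};

function pageMatches(
  comparison: { screenshotMatch?: number; textMatch?: number; resourceCounts: ResourceCounts },
  threshold = 0.9,
): boolean {
  const { screenshotMatch = 0, textMatch = 0, resourceCounts } = comparison;
  const { replayGood = 0, replayBad = 0 } = resourceCounts;
  const total = replayGood + replayBad;
  // Treat a page with no replayed resources as trivially OK on that axis.
  const resourcesOk = total === 0 || replayGood / total >= threshold;
  return screenshotMatch >= threshold && textMatch >= threshold && resourcesOk;
}
```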


@@ -117,9 +117,11 @@ test("check parsing saved state + page done + queue present", () => {
 test("check crawl restarted with saved state", async () => {
   let containerId = null;

+  const port = 36379;
+
   try {
     containerId = execSync(
-      `docker run -d -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`,
+      `docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`,
       { encoding: "utf-8" },
     );
   } catch (error) {
@@ -128,14 +130,11 @@ test("check crawl restarted with saved state", async () => {
   await sleep(2000);

-  const redis = new Redis("redis://127.0.0.1:36379/0", { lazyConnect: true });
+  const redis = new Redis(`redis://127.0.0.1:${port}/0`, { lazyConnect: true, retryStrategy: () => null });

   try {
     await redis.connect({
       maxRetriesPerRequest: 100,
-      retryStrategy(times) {
-        return times < 100 ? 1000 : null;
-      },
     });

     await sleep(2000);
@@ -150,11 +149,5 @@ test("check crawl restarted with saved state", async () => {
     console.log(e);
   } finally {
     await waitContainer(containerId);
-
-    try {
-      await redis.disconnect();
-    } catch (e) {
-      // ignore
-    }
   }
 });


@@ -1,7 +1,6 @@
 import child_process from "child_process";
 import Redis from "ioredis";

 function sleep(ms) {
   return new Promise((resolve) => setTimeout(resolve, ms));
 }
@@ -31,20 +30,17 @@ async function waitContainer(containerId) {
 }

 async function runCrawl(numExpected, url, sitemap="", limit=0) {
-  const containerId = child_process.execSync(`docker run -d -p 36379:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis`, {encoding: "utf-8"});
+  const containerId = child_process.execSync(`docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis`, {encoding: "utf-8"});

-  await sleep(2000);
+  await sleep(3000);

-  const redis = new Redis("redis://127.0.0.1:36379/0", { lazyConnect: true });
+  const redis = new Redis("redis://127.0.0.1:36381/0", { lazyConnect: true, retryStrategy: () => null });

   let finished = 0;

   try {
     await redis.connect({
       maxRetriesPerRequest: 100,
-      retryStrategy(times) {
-        return times < 100 ? 1000 : null;
-      },
     });

     while (true) {
@@ -58,11 +54,6 @@ async function runCrawl(numExpected, url, sitemap="", limit=0) {
     console.error(e);
   } finally {
     await waitContainer(containerId);
-    try {
-      await redis.disconnect();
-    } catch (e) {
-      // ignore
-    }
   }

   expect(finished).toBeGreaterThanOrEqual(numExpected);
@@ -79,4 +70,3 @@ test("test sitemap with limit", async () => {

 test("test sitemap with limit, specific URL", async () => {
   await runCrawl(1900, "https://www.mozilla.org/", "https://www.mozilla.org/sitemap.xml", 2000);
 });

yarn.lock: 1156 lines changed (diff suppressed because it is too large)