Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00
QA Crawl Support (Beta) (#469)
Initial (beta) support for QA/replay crawling!

- Supports running a crawl over a given WACZ / list of WACZs (multi-WACZ) input, hosted in ReplayWeb.page
- Runs a local HTTP server with a full-page, UI-less ReplayWeb.page embed
- The ReplayWeb.page release version is configured in the Dockerfile; the pinned ui.js and sw.js are fetched directly from the CDN (jsdelivr)

Can be deployed with the `webrecorder/browsertrix-crawler qa` entrypoint:

- Requires `--qaSource`, pointing to the WACZ or multi-WACZ JSON to be replayed and QA'd
- Also supports `--qaRedisKey`: if specified, QA comparison data is pushed to that Redis key
- Supports `--qaDebugImageDiff` for outputting crawl/replay/diff images
- If using `--writePagesToRedis`, a `comparison` key is added to the existing page data (a consumer sketch follows the commit metadata below):

```
comparison: {
  screenshotMatch?: number;
  textMatch?: number;
  resourceCounts: {
    crawlGood?: number;
    crawlBad?: number;
    replayGood?: number;
    replayBad?: number;
  };
};
```

- Bump version to 1.1.0-beta.2
commit bb9c82493b (parent 22a7351dc7)

22 changed files with 2068 additions and 598 deletions
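When `--qaRedisKey` and `--writePagesToRedis` are both set, each finished page lands in Redis as a JSON string carrying the `comparison` block above. A minimal consumer sketch, assuming the entries sit in a Redis list; the key name and `redis://` URL here are placeholders, not values from this commit:

```ts
import Redis from "ioredis";

// Drain QA page entries and print the match scores.
// Key name and URL are illustrative -- use whatever was passed via --qaRedisKey.
async function drainComparisons(redisUrl: string, key: string) {
  const redis = new Redis(redisUrl);
  for (let raw = await redis.lpop(key); raw; raw = await redis.lpop(key)) {
    const page = JSON.parse(raw);
    const { screenshotMatch, textMatch, resourceCounts } = page.comparison ?? {};
    console.log(page.url, { screenshotMatch, textMatch, resourceCounts });
  }
  redis.disconnect();
}

drainComparisons("redis://localhost:6379/0", "qa:pages").catch(console.error);
```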
.github/workflows/ci.yaml — 6 lines changed (vendored)

```diff
@@ -42,10 +42,10 @@ jobs:
       run: yarn run tsc
     - name: build docker
       run: docker-compose build
-    - name: run jest
+    - name: run all tests as root
      run: sudo yarn test
-    - name: run saved state test with volume owned by different user
+    - name: run saved state + qa compare test as non-root - with volume owned by current user
      run: |
        sudo rm -rf ./test-crawls
        mkdir test-crawls
-       sudo yarn test ./tests/saved-state.test.js
+       sudo yarn test ./tests/saved-state.test.js ./tests/qa_compare.test.js
```
Dockerfile — 10 lines changed

```diff
@@ -48,9 +48,15 @@ ADD config/ /app/
 
 ADD html/ /app/html/
 
-RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js
+ARG RWP_VERSION=1.8.15
+ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/
+ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
 
-RUN ln -s /app/dist/main.js /usr/bin/crawl; ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile
+RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js && chmod a+r /app/html/rwp/*
+
+RUN ln -s /app/dist/main.js /usr/bin/crawl; \
+    ln -s /app/dist/main.js /usr/bin/qa; \
+    ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile
 
 WORKDIR /crawls
```
html/replay.html — 39 lines (new file)

```html
<!doctype html>
<html>
  <head>
    <script src="/ui.js"></script>
    <style>
      html {
        width: 100%;
        height: 100%;
        display: flex;
      }
      body {
        width: 100%;
        margin: 0;
        padding: 0;
      }
      replay-web-page {
        margin: 0;
        padding: 0;
        border: 0;
        position: fixed;
        width: 100vw;
        height: 100vh;
        top: 0;
        left: 0;
      }
    </style>
  </head>
  <body>
    <replay-web-page
      embed="replayonly"
      deepLink="true"
      source="$SOURCE"
      url="about:blank"
      ts=""
      coll="replay"
    >
    </replay-web-page>
  </body>
</html>
```
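The `$SOURCE` placeholder in the embed is what ties this template to `--qaSource`. The substitution itself lives in `src/util/replayserver.ts`, which is part of this commit but not shown in this excerpt; below is a hypothetical sketch of the idea only, assuming a plain string replacement and the port implied by the `localhost:9990` URLs in `replaycrawler.ts` further down:

```ts
import http from "http";
import fs from "fs/promises";

// Hypothetical sketch only: serve replay.html with $SOURCE swapped for the
// --qaSource URL, so the ReplayWeb.page embed loads the WACZ under test.
async function serveReplay(qaSource: string, port = 9990) {
  const template = await fs.readFile("html/replay.html", "utf8");
  const html = template.replace("$SOURCE", qaSource);

  http
    .createServer((req, res) => {
      if (req.url === "/") {
        res.writeHead(200, { "Content-Type": "text/html" });
        res.end(html);
      } else {
        res.writeHead(404);
        res.end();
      }
    })
    .listen(port);
}
```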
package.json — 10 lines changed

```diff
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "1.0.2",
+  "version": "1.1.0-beta.2",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
@@ -24,9 +24,12 @@
     "get-folder-size": "^4.0.0",
     "husky": "^8.0.3",
     "ioredis": "^5.3.2",
+    "js-levenshtein": "^1.1.6",
     "js-yaml": "^4.1.0",
     "minio": "^7.1.3",
     "p-queue": "^7.3.4",
+    "pixelmatch": "^5.3.0",
+    "pngjs": "^7.0.0",
     "puppeteer-core": "^20.8.2",
     "sax": "^1.3.0",
     "sharp": "^0.32.6",
@@ -37,8 +40,11 @@
     "yargs": "^17.7.2"
   },
   "devDependencies": {
+    "@types/js-levenshtein": "^1.1.3",
     "@types/js-yaml": "^4.0.8",
     "@types/node": "^20.8.7",
+    "@types/pixelmatch": "^5.2.6",
+    "@types/pngjs": "^6.0.4",
     "@types/uuid": "^9.0.6",
     "@types/ws": "^8.5.8",
     "@typescript-eslint/eslint-plugin": "^6.10.0",
@@ -46,7 +52,7 @@
     "eslint": "^8.53.0",
     "eslint-config-prettier": "^9.0.0",
     "eslint-plugin-react": "^7.22.0",
-    "jest": "^29.2.1",
+    "jest": "^29.7.0",
     "md5": "^2.3.0",
     "prettier": "3.0.3",
     "typescript": "^5.2.2"
```
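The new runtime dependencies map one-to-one onto the comparison metrics: `pixelmatch` + `pngjs` for screenshot diffs, `js-levenshtein` for extracted text. A standalone sketch of the screenshot path, using the same `threshold` value as the new `replaycrawler.ts` below (file names are placeholders):

```ts
import fs from "fs";
import { PNG } from "pngjs";
import pixelmatch from "pixelmatch";

// Compare two equal-sized PNG screenshots; pixelmatch returns the number of
// differing pixels and fills `diff` with a visual diff image.
const crawl = PNG.sync.read(fs.readFileSync("crawl.png"));
const replay = PNG.sync.read(fs.readFileSync("replay.png"));

const { width, height } = replay;
const diff = new PNG({ width, height });

const differing = pixelmatch(crawl.data, replay.data, diff.data, width, height, {
  threshold: 0.1,
});

console.log("screenshotMatch:", (width * height - differing) / (width * height));
fs.writeFileSync("diff.png", PNG.sync.write(diff));
```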
src/crawler.ts — 187 lines changed

```diff
@@ -13,6 +13,8 @@ import {
   PageCallbacks,
 } from "./util/state.js";
 
+import { parseArgs } from "./util/argParser.js";
+
 import yaml from "js-yaml";
 
 import * as warcio from "warcio";
@@ -29,7 +31,6 @@ import {
 } from "./util/storage.js";
 import { ScreenCaster, WSTransport } from "./util/screencaster.js";
 import { Screenshots } from "./util/screenshots.js";
-import { parseArgs } from "./util/argParser.js";
 import { initRedis } from "./util/redis.js";
 import { logger, formatErr } from "./util/logger.js";
 import {
@@ -57,6 +58,7 @@ import { OriginOverride } from "./util/originoverride.js";
 import { Agent as HTTPAgent } from "http";
 import { Agent as HTTPSAgent } from "https";
 import { CDPSession, Frame, HTTPRequest, Page } from "puppeteer-core";
+import { Recorder } from "./util/recorder.js";
 import { SitemapReader } from "./util/sitemapper.js";
 import { ScopedSeed } from "./util/seeds.js";
 
@@ -146,6 +148,8 @@ export class Crawler {
   pagesDir: string;
   pagesFile: string;
 
+  archivesDir: string;
+
   blockRules: BlockRules | null;
   adBlockRules: AdBlockRules | null;
 
@@ -154,11 +158,15 @@ export class Crawler {
 
   screencaster: ScreenCaster | null = null;
 
+  skipTextDocs = 0;
+
   interrupted = false;
   finalExit = false;
   uploadAndDeleteLocal = false;
   done = false;
 
+  textInPages = false;
+
   customBehaviors = "";
   behaviorsChecked = false;
   behaviorLastLine?: string;
@@ -178,10 +186,12 @@ export class Crawler {
     crawler: Crawler;
   }) => NonNullable<unknown>;
 
+  recording = true;
+
   constructor() {
-    const res = parseArgs();
-    this.params = res.parsed;
-    this.origConfig = res.origConfig;
+    const args = this.parseArgs();
+    this.params = args.parsed;
+    this.origConfig = args.origConfig;
 
     // root collections dir
     this.collDir = path.join(
@@ -259,6 +269,9 @@ export class Crawler {
     // pages file
     this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
 
+    // archives dir
+    this.archivesDir = path.join(this.collDir, "archive");
+
     this.blockRules = null;
     this.adBlockRules = null;
 
@@ -268,6 +281,8 @@ export class Crawler {
     this.finalExit = false;
     this.uploadAndDeleteLocal = false;
 
+    this.textInPages = this.params.text.includes("to-pages");
+
     this.done = false;
 
     this.customBehaviors = "";
@@ -281,6 +296,10 @@ export class Crawler {
     }
   }
 
+  protected parseArgs() {
+    return parseArgs();
+  }
+
   configureUA() {
     // override userAgent
     if (this.params.userAgent) {
@@ -434,7 +453,9 @@ export class Crawler {
     // logger.info("wb-manager init failed, collection likely already exists");
     //}
 
-    fs.mkdirSync(this.logDir, { recursive: true });
+    await fsp.mkdir(this.logDir, { recursive: true });
+    await fsp.mkdir(this.archivesDir, { recursive: true });
+
     this.logFH = fs.createWriteStream(this.logFilename);
     logger.setExternalLogStream(this.logFH);
 
@@ -721,10 +742,10 @@ self.__bx_behaviors.selectMainBehavior();
     return "";
   }
 
-  async crawlPage(opts: WorkerState) {
+  async crawlPage(opts: WorkerState): Promise<void> {
     await this.writeStats();
 
-    const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
+    const { page, data, workerid, callbacks, directFetchCapture } = opts;
     data.callbacks = callbacks;
 
     const { url } = data;
@@ -764,7 +785,7 @@ self.__bx_behaviors.selectMainBehavior();
           { url, ...logDetails },
           "fetch",
         );
-        return true;
+        return;
       }
     } catch (e) {
       // filtered out direct fetch
@@ -782,7 +803,14 @@ self.__bx_behaviors.selectMainBehavior();
     data.title = await page.title();
     data.favicon = await this.getFavicon(page, logDetails);
 
-    const archiveDir = path.join(this.collDir, "archive");
+    await this.doPostLoadActions(opts);
+  }
+
+  async doPostLoadActions(opts: WorkerState, saveOutput = false) {
+    const { page, cdp, data, workerid } = opts;
+    const { url } = data;
+
+    const logDetails = { page: url, workerid };
 
     if (this.params.screenshot) {
       if (!data.isHTMLPage) {
@@ -793,10 +821,10 @@ self.__bx_behaviors.selectMainBehavior();
         browser: this.browser,
         page,
         url,
-        directory: archiveDir,
+        directory: this.archivesDir,
       });
       if (this.params.screenshot.includes("view")) {
-        await screenshots.take();
+        await screenshots.take("view", saveOutput ? data : null);
       }
       if (this.params.screenshot.includes("fullPage")) {
         await screenshots.takeFullPage();
@@ -812,15 +840,16 @@ self.__bx_behaviors.selectMainBehavior();
       textextract = new TextExtractViaSnapshot(cdp, {
         warcPrefix: this.warcPrefix,
         url,
-        directory: archiveDir,
+        directory: this.archivesDir,
+        skipDocs: this.skipTextDocs,
       });
-      const { changed, text } = await textextract.extractAndStoreText(
+      const { text } = await textextract.extractAndStoreText(
         "text",
         false,
         this.params.text.includes("to-warc"),
       );
 
-      if (changed && text && this.params.text.includes("to-pages")) {
+      if (text && (this.textInPages || saveOutput)) {
         data.text = text;
       }
     }
@@ -868,8 +897,6 @@ self.__bx_behaviors.selectMainBehavior();
       );
       await sleep(this.params.pageExtraDelay);
     }
-
-    return true;
   }
 
   async pageFinished(data: PageState) {
@@ -1047,8 +1074,7 @@ self.__bx_behaviors.selectMainBehavior();
   async checkLimits() {
     let interrupt = false;
 
-    const dir = path.join(this.collDir, "archive");
-    const size = await getDirSize(dir);
+    const size = await getDirSize(this.archivesDir);
 
     await this.crawlState.setArchiveSize(size);
 
@@ -1230,28 +1256,11 @@ self.__bx_behaviors.selectMainBehavior();
 
     this.screencaster = this.initScreenCaster();
 
-    if (this.params.originOverride.length) {
+    if (this.params.originOverride && this.params.originOverride.length) {
       this.originOverride = new OriginOverride(this.params.originOverride);
     }
 
-    for (let i = 0; i < this.params.scopedSeeds.length; i++) {
-      const seed = this.params.scopedSeeds[i];
-      if (!(await this.queueUrl(i, seed.url, 0, 0))) {
-        if (this.limitHit) {
-          break;
-        }
-      }
-
-      if (seed.sitemap) {
-        await timedRun(
-          this.parseSitemap(seed, i),
-          SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
-          "Sitemap initial fetch timed out",
-          { sitemap: seed.sitemap, seed: seed.url },
-          "sitemap",
-        );
-      }
-    }
+    await this._addInitialSeeds();
 
     await this.browser.launch({
       profileUrl: this.params.profile,
@@ -1272,12 +1281,14 @@ self.__bx_behaviors.selectMainBehavior();
           "browser",
         );
       },
+
+      recording: this.recording,
       // eslint-disable-next-line @typescript-eslint/no-explicit-any
     } as any);
 
     // --------------
     // Run Crawl Here!
-    await runWorkers(this, this.params.workers, this.maxPageTime, this.collDir);
+    await runWorkers(this, this.params.workers, this.maxPageTime);
     // --------------
 
     await this.serializeConfig(true);
@@ -1297,6 +1308,27 @@ self.__bx_behaviors.selectMainBehavior();
     await this.postCrawl();
   }
 
+  protected async _addInitialSeeds() {
+    for (let i = 0; i < this.params.scopedSeeds.length; i++) {
+      const seed = this.params.scopedSeeds[i];
+      if (!(await this.queueUrl(i, seed.url, 0, 0))) {
+        if (this.limitHit) {
+          break;
+        }
+      }
+
+      if (seed.sitemap) {
+        await timedRun(
+          this.parseSitemap(seed, i),
+          SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
+          "Sitemap initial fetch timed out",
+          { sitemap: seed.sitemap, seed: seed.url },
+          "sitemap",
+        );
+      }
+    }
+  }
+
   async postCrawl() {
     if (this.params.combineWARC) {
       await this.combineWARC();
@@ -1307,9 +1339,9 @@ self.__bx_behaviors.selectMainBehavior();
       await fsp.mkdir(path.join(this.collDir, "indexes"), { recursive: true });
       await this.crawlState.setStatus("generate-cdx");
 
-      const warcList = await fsp.readdir(path.join(this.collDir, "archive"));
+      const warcList = await fsp.readdir(this.archivesDir);
       const warcListFull = warcList.map((filename) =>
-        path.join(this.collDir, "archive", filename),
+        path.join(this.archivesDir, filename),
       );
 
       //const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
@@ -1377,10 +1409,8 @@ self.__bx_behaviors.selectMainBehavior();
       logger.info("Generating WACZ");
       await this.crawlState.setStatus("generate-wacz");
 
-      const archiveDir = path.join(this.collDir, "archive");
-
       // Get a list of the warcs inside
-      const warcFileList = await fsp.readdir(archiveDir);
+      const warcFileList = await fsp.readdir(this.archivesDir);
 
       // is finished (>0 pages and all pages written)
       const isFinished = await this.crawlState.isFinished();
@@ -1440,7 +1470,9 @@ self.__bx_behaviors.selectMainBehavior();
 
       createArgs.push("-f");
 
-      warcFileList.forEach((val) => createArgs.push(path.join(archiveDir, val)));
+      warcFileList.forEach((val) =>
+        createArgs.push(path.join(this.archivesDir, val)),
+      );
 
       // create WACZ
       const waczResult = await this.awaitProcess(
@@ -1900,13 +1932,15 @@ self.__bx_behaviors.selectMainBehavior();
     depth: number,
     extraHops: number,
     logDetails: LogDetails = {},
+    ts = 0,
+    pageid?: string,
   ) {
     if (this.limitHit) {
       return false;
     }
 
     const result = await this.crawlState.addToQueue(
-      { url, seedId, depth, extraHops },
+      { url, seedId, depth, extraHops, ts, pageid },
       this.pageLimit,
     );
 
@@ -1954,7 +1988,7 @@ self.__bx_behaviors.selectMainBehavior();
       id: "pages",
       title: "All Pages",
     };
-    header["hasText"] = this.params.text.includes("to-pages");
+    header["hasText"] = String(this.textInPages);
     if (this.params.text.length) {
       logger.debug("Text Extraction: " + this.params.text.join(","));
     } else {
@@ -1968,20 +2002,30 @@ self.__bx_behaviors.selectMainBehavior();
       }
     }
 
-  async writePage({
-    pageid,
-    url,
-    depth,
-    title,
-    text,
-    loadState,
-    mime,
-    favicon,
-    ts,
-    status,
-  }: PageState) {
-    const row: PageEntry = { id: pageid!, url, title, loadState };
+  protected pageEntryForRedis(
+    entry: Record<string, string | number | boolean | object>,
+    // eslint-disable-next-line @typescript-eslint/no-unused-vars
+    state: PageState,
+  ) {
+    return entry;
+  }
+
+  async writePage(state: PageState) {
+    const {
+      pageid,
+      url,
+      depth,
+      title,
+      text,
+      loadState,
+      mime,
+      favicon,
+      status,
+    } = state;
+
+    const row: PageEntry = { id: pageid, url, title, loadState };
+
+    let { ts } = state;
     if (!ts) {
       ts = new Date();
       logger.warn("Page date missing, setting to now", { url, ts });
@@ -1998,14 +2042,16 @@ self.__bx_behaviors.selectMainBehavior();
     }
 
     if (this.params.writePagesToRedis) {
-      await this.crawlState.writeToPagesQueue(JSON.stringify(row));
+      await this.crawlState.writeToPagesQueue(
+        JSON.stringify(this.pageEntryForRedis(row, state)),
+      );
     }
 
     if (depth === 0) {
       row.seed = true;
     }
 
-    if (text) {
+    if (text && this.textInPages) {
       row.text = text;
     }
 
@@ -2151,7 +2197,7 @@ self.__bx_behaviors.selectMainBehavior();
     await this.crawlState.setStatus("generate-warc");
 
     // Get the list of created Warcs
-    const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));
+    const warcLists = await fsp.readdir(this.archivesDir);
 
     logger.debug(`Combining ${warcLists.length} WARCs...`);
 
@@ -2159,7 +2205,7 @@ self.__bx_behaviors.selectMainBehavior();
 
     // Go through a list of the created works and create an array sorted by their filesize with the largest file first.
     for (let i = 0; i < warcLists.length; i++) {
-      const fileName = path.join(this.collDir, "archive", warcLists[i]);
+      const fileName = path.join(this.archivesDir, warcLists[i]);
       const fileSize = await getFileSize(fileName);
       fileSizeObjects.push({ fileSize: fileSize, fileName: fileName });
       fileSizeObjects.sort((a, b) => b.fileSize - a.fileSize);
@@ -2316,6 +2362,21 @@ self.__bx_behaviors.selectMainBehavior();
       await this.storage.uploadFile(filename, targetFilename);
     }
   }
+
+  createRecorder(id: number): Recorder | null {
+    if (!this.recording) {
+      return null;
+    }
+
+    const res = new Recorder({
+      workerid: id,
+      collDir: this.collDir,
+      crawler: this,
+    });
+
+    this.browser.recorders.push(res);
+    return res;
+  }
 }
 
 function shouldIgnoreAbort(req: HTTPRequest) {
```
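Taken together, the `crawler.ts` changes refactor hard-coded behavior into overridable hooks: `parseArgs()`, `_addInitialSeeds()`, `pageEntryForRedis()`, and `createRecorder()`, plus the `recording` flag. A minimal sketch of how a subclass plugs into them (the subclass here is hypothetical, not part of this commit):

```ts
import { Crawler } from "./crawler.js";
import { PageState } from "./util/state.js";
import { Recorder } from "./util/recorder.js";

// Hypothetical subclass, shown only to illustrate the hooks added above.
class AuditCrawler extends Crawler {
  // seed the queue from somewhere other than --seeds
  protected async _addInitialSeeds() {
    await this.queueUrl(0, "https://example.com/", 0, 0);
  }

  // decorate page entries before they are pushed to Redis
  protected pageEntryForRedis(
    entry: Record<string, string | number | boolean | object>,
    state: PageState,
  ) {
    entry.audited = Boolean(state.url);
    return entry;
  }

  // opt out of WARC recording entirely
  createRecorder(): Recorder | null {
    return null;
  }
}
```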
src/create-login-profile.ts

```diff
@@ -186,6 +186,7 @@ async function main() {
         "--test-type",
       ],
     },
+    recording: false,
   });
 
   if (params.interactive) {
```
src/main.ts

```diff
@@ -3,6 +3,7 @@
 import { logger } from "./util/logger.js";
 import { setExitOnRedisError } from "./util/redis.js";
 import { Crawler } from "./crawler.js";
+import { ReplayCrawler } from "./replaycrawler.js";
 
 let crawler: Crawler | null = null;
 
@@ -49,5 +50,10 @@ process.on("SIGABRT", async () => {
   forceTerm = true;
 });
 
-crawler = new Crawler();
+if (process.argv[1].endsWith("qa")) {
+  crawler = new ReplayCrawler();
+} else {
+  crawler = new Crawler();
+}
+
 crawler.run();
```
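Together with the Dockerfile change above, where both `/usr/bin/crawl` and `/usr/bin/qa` are symlinked to `dist/main.js`, this basename check is all that routes an invocation such as `docker run webrecorder/browsertrix-crawler qa --qaSource <wacz-or-json>` to the `ReplayCrawler` instead of the regular `Crawler`.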
src/replaycrawler.ts — 731 lines (new file)

```ts
import { Page, Protocol } from "puppeteer-core";
import { Crawler } from "./crawler.js";
import { ReplayServer } from "./util/replayserver.js";
import { sleep } from "./util/timing.js";
import { logger } from "./util/logger.js";
import { WorkerOpts, WorkerState } from "./util/worker.js";
import { PageState } from "./util/state.js";
import { PageInfoRecord, PageInfoValue, Recorder } from "./util/recorder.js";

import fsp from "fs/promises";
import path from "path";

// @ts-expect-error wabac.js
import { ZipRangeReader } from "@webrecorder/wabac/src/wacz/ziprangereader.js";
// @ts-expect-error wabac.js
import { createLoader } from "@webrecorder/wabac/src/blockloaders.js";

import { AsyncIterReader } from "warcio";
import { WARCResourceWriter } from "./util/warcresourcewriter.js";
import { parseArgs } from "./util/argParser.js";

import { PNG } from "pngjs";
import pixelmatch from "pixelmatch";

import levenshtein from "js-levenshtein";
import { MAX_URL_LENGTH } from "./util/reqresp.js";
import { openAsBlob } from "fs";

// RWP Replay Prefix
const REPLAY_PREFIX = "http://localhost:9990/replay/w/replay/";

// RWP Source Url
const REPLAY_SOURCE = "http://localhost:9990/replay/?source=";

// When iterating over page.frames(), the first two frames are for the top-level page
// and RWP embed, the actual content starts with frame index 2
const SKIP_FRAMES = 2;

type ReplayPage = {
  url: string;
  ts: number;
  id: string;
};

type ComparisonData = {
  comparison: {
    screenshotMatch?: number;
    textMatch?: number;
    resourceCounts: {
      crawlGood?: number;
      crawlBad?: number;
      replayGood?: number;
      replayBad?: number;
    };
  };
};

type ReplayPageInfoRecord = PageInfoRecord & ComparisonData;

type ComparisonPageState = PageState & ComparisonData;

// ============================================================================
// Crawler designed to run over replay of existing WACZ files to generate comparison
// data (eg. for QA)
export class ReplayCrawler extends Crawler {
  replayServer: ReplayServer;
  qaSource: string;

  pageInfos: Map<Page, ReplayPageInfoRecord>;

  reloadTimeouts: WeakMap<Page, NodeJS.Timeout>;

  constructor() {
    super();
    this.recording = false;
    if (!this.params.qaSource) {
      throw new Error("Missing QA source");
    }
    this.qaSource = this.params.qaSource;
    this.replayServer = new ReplayServer(this.qaSource);

    logger.info(
      "Replay Crawl with Source",
      { source: this.qaSource },
      "general",
    );

    this.pageInfos = new Map<Page, ReplayPageInfoRecord>();

    // skip text from first two frames, as they are RWP boilerplate
    this.skipTextDocs = SKIP_FRAMES;

    this.params.scopedSeeds = [];

    this.params.screenshot = ["view"];
    this.params.text = ["to-warc"];

    this.params.serviceWorker = "enabled";

    this.reloadTimeouts = new WeakMap<Page, NodeJS.Timeout>();
  }

  protected parseArgs() {
    return parseArgs(process.argv, true);
  }

  async setupPage(opts: WorkerState) {
    await super.setupPage(opts);
    const { page, cdp } = opts;

    if (!this.qaSource) {
      throw new Error("Missing QA source");
    }

    await cdp.send("Network.enable");

    cdp.on("Network.responseReceived", async (params) =>
      this.handlePageResourceResponse(params, page),
    );

    cdp.on("Network.requestWillBeSent", (params) =>
      this.handleRequestWillBeSent(params, page),
    );

    await page.goto(this.replayServer.homePage);

    // wait until content frame is available
    while (page.frames().length < SKIP_FRAMES) {
      await sleep(5);
    }

    const frame = page.frames()[1];

    await frame.evaluate(() => {
      return navigator.serviceWorker.ready;
    });
  }

  protected async _addInitialSeeds() {
    await this.loadPages(this.qaSource);
  }

  isInScope() {
    return true;
  }

  async loadPages(url: string) {
    let path = url;

    try {
      path = new URL(url).pathname;
    } catch (e) {
      // ignore
    }

    if (path.endsWith(".wacz")) {
      await this.loadPagesForWACZ(url);
    } else if (path.endsWith(".json")) {
      if (!url.startsWith("http://") && !url.startsWith("https://")) {
        const blob = await openAsBlob(url);
        url = URL.createObjectURL(blob);
      }

      const resp = await fetch(url);
      const json = await resp.json();

      // if json contains pages, just load them directly
      if (json.pages) {
        await this.loadPagesDirect(json.pages);
      } else {
        // otherwise, parse pages from WACZ files
        for (const entry of json.resources) {
          if (entry.path) {
            await this.loadPages(entry.path);
          }
        }
      }
    } else {
      logger.warn("Unknown replay source", { url }, "replay");
    }
  }

  async loadPagesForWACZ(url: string) {
    const loader = new WACZLoader(url);
    await loader.init();

    let count = 0;

    const pagesReader = await loader.loadFile("pages/pages.jsonl");

    if (pagesReader) {
      for await (const buff of pagesReader.iterLines()) {
        await this.addPage(buff, count++);
        if (this.limitHit) {
          break;
        }
      }
    }

    const extraPagesReader = await loader.loadFile("pages/extraPages.jsonl");

    if (extraPagesReader) {
      for await (const buff of extraPagesReader.iterLines()) {
        await this.addPage(buff, count++);
        if (this.limitHit) {
          break;
        }
      }
    }
  }

  async loadPagesDirect(pages: ReplayPage[]) {
    let depth = 0;
    for (const entry of pages) {
      const { url, ts, id } = entry;
      if (!url) {
        continue;
      }
      if (this.limitHit) {
        break;
      }
      await this.queueUrl(0, url, depth++, 0, {}, ts, id);
    }
  }

  async addPage(page: string, depth: number) {
    let pageData: ReplayPage;

    if (!page.length) {
      return;
    }

    try {
      pageData = JSON.parse(page);
    } catch (e) {
      console.log(page, e);
      return;
    }

    const { url, ts, id } = pageData;
    if (!url) {
      return;
    }

    await this.queueUrl(0, url, depth, 0, {}, ts, id);
  }

  extraChromeArgs(): string[] {
    return [...super.extraChromeArgs(), "--disable-web-security"];
  }

  handleRequestWillBeSent(
    params: Protocol.Network.RequestWillBeSentEvent,
    page: Page,
  ) {
    // only handling redirect here, committing last response in redirect chain
    const { redirectResponse, type } = params;
    if (redirectResponse) {
      const { url, status, mimeType } = redirectResponse;
      this.addPageResource(url, page, { status, mime: mimeType, type });
    }
  }

  async handlePageResourceResponse(
    params: Protocol.Network.ResponseReceivedEvent,
    page: Page,
  ) {
    const { response } = params;
    const { url, status } = response;
    if (!url.startsWith(REPLAY_PREFIX)) {
      if (url.startsWith(REPLAY_SOURCE)) {
        const { mimeType, fromServiceWorker } = response;
        if (
          !fromServiceWorker &&
          mimeType === "application/json" &&
          page.frames().length > 1
        ) {
          const frame = page.frames()[1];
          const timeoutid = setTimeout(() => {
            logger.warn("Reloading RWP Frame, not inited", { url }, "replay");
            frame.evaluate("window.location.reload();");
          }, 10000);
          this.reloadTimeouts.set(page, timeoutid);
        } else if (fromServiceWorker && mimeType !== "application/json") {
          const timeoutid = this.reloadTimeouts.get(page);
          if (timeoutid) {
            clearTimeout(timeoutid);
            this.reloadTimeouts.delete(page);
          }
        }
      }
      return;
    }

    const { type } = params;
    const { mimeType } = response;

    this.addPageResource(url, page, { status, mime: mimeType, type });
  }

  addPageResource(
    url: string,
    page: Page,
    { status, mime, type }: PageInfoValue,
  ) {
    const inx = url.indexOf("_/");
    if (inx <= 0) {
      return;
    }

    let replayUrl = url.slice(inx + 2, MAX_URL_LENGTH);

    const pageInfo = this.pageInfos.get(page);

    if (!pageInfo) {
      return;
    }

    if (replayUrl.startsWith("//")) {
      try {
        replayUrl = new URL(replayUrl, pageInfo.url).href;
      } catch (e) {
        //
      }
    }

    if (replayUrl.startsWith("http://") || replayUrl.startsWith("https://")) {
      pageInfo.urls[replayUrl] = { status, mime, type };
    }
  }

  async crawlPage(opts: WorkerState): Promise<void> {
    await this.writeStats();

    const { page, data } = opts;
    const { url, ts, pageid } = data;

    if (!ts) {
      return;
    }

    const date = new Date(ts);

    const timestamp = date.toISOString().slice(0, 19).replace(/[T:-]/g, "");

    logger.info("Loading Replay", { url, timestamp }, "replay");

    const pageInfo = {
      pageid,
      urls: {},
      url,
      ts: date,
      comparison: { resourceCounts: {} },
      counts: { jsErrors: 0 },
    };
    this.pageInfos.set(page, pageInfo);

    await page.evaluate(
      (url, ts) => {
        const rwp = document.querySelector("replay-web-page");
        if (!rwp) {
          return;
        }
        const p = new Promise<void>((resolve) => {
          window.addEventListener(
            "message",
            (e) => {
              if (e.data && e.data.url && e.data.view) {
                resolve();
              }
            },
            { once: true },
          );
        });

        rwp.setAttribute("url", url);
        rwp.setAttribute("ts", ts ? ts : "");
        return p;
      },
      url,
      timestamp,
    );

    // optionally reload (todo: reevaluate if this is needed)
    // await page.reload();

    await sleep(10);

    data.isHTMLPage = true;

    // skipping RWP frames
    data.filteredFrames = page.frames().slice(SKIP_FRAMES);

    try {
      data.title = await data.filteredFrames[0].title();
    } catch (e) {
      // ignore
    }

    data.favicon = await this.getFavicon(page, {});

    await this.doPostLoadActions(opts, true);

    await this.compareScreenshots(page, data, url, date);

    await this.compareText(page, data, url, date);

    await this.compareResources(page, data, url, date);

    await this.processPageInfo(page, data);
  }

  async compareScreenshots(
    page: Page,
    state: PageState,
    url: string,
    date?: Date,
  ) {
    const origScreenshot = await this.fetchOrigBinary(
      page,
      "view",
      url,
      date ? date.toISOString().replace(/[^\d]/g, "") : "",
    );
    const { pageid, screenshotView } = state;

    if (!origScreenshot || !origScreenshot.length) {
      logger.warn("Orig screenshot missing for comparison", { url }, "replay");
      return;
    }

    if (!screenshotView || !screenshotView.length) {
      logger.warn(
        "Replay screenshot missing for comparison",
        { url },
        "replay",
      );
      return;
    }

    const crawl = PNG.sync.read(origScreenshot);
    const replay = PNG.sync.read(screenshotView);

    const { width, height } = replay;
    const diff = new PNG({ width, height });

    const res = pixelmatch(crawl.data, replay.data, diff.data, width, height, {
      threshold: 0.1,
      alpha: 0,
    });

    const total = width * height;

    const matchPercent = (total - res) / total;

    logger.info(
      "Screenshot Diff",
      {
        url,
        diff: res,
        matchPercent,
      },
      "replay",
    );

    if (res && this.params.qaDebugImageDiff) {
      const dir = path.join(this.collDir, "screenshots", pageid || "unknown");
      await fsp.mkdir(dir, { recursive: true });
      await fsp.writeFile(path.join(dir, "crawl.png"), PNG.sync.write(crawl));
      await fsp.writeFile(path.join(dir, "replay.png"), PNG.sync.write(replay));
      await fsp.writeFile(path.join(dir, "diff.png"), PNG.sync.write(diff));
    }

    const pageInfo = this.pageInfos.get(page);
    if (pageInfo) {
      pageInfo.comparison.screenshotMatch = matchPercent;
    }
  }

  async compareText(page: Page, state: PageState, url: string, date?: Date) {
    const origText = await this.fetchOrigText(
      page,
      "text",
      url,
      date ? date.toISOString().replace(/[^\d]/g, "") : "",
    );
    const replayText = state.text;

    if (!origText || !replayText) {
      logger.warn(
        "Text missing for comparison",
        {
          url,
          origTextLen: origText?.length,
          replayTextLen: replayText?.length,
        },
        "replay",
      );
      return;
    }

    const dist = levenshtein(origText, replayText);
    const maxLen = Math.max(origText.length, replayText.length);
    const matchPercent = (maxLen - dist) / maxLen;
    logger.info("Levenshtein Dist", { url, dist, matchPercent, maxLen });

    const pageInfo = this.pageInfos.get(page);
    if (pageInfo) {
      pageInfo.comparison.textMatch = matchPercent;
    }
  }

  async compareResources(
    page: Page,
    state: PageState,
    url: string,
    date?: Date,
  ) {
    const origResources = await this.fetchOrigText(
      page,
      "pageinfo",
      url,
      date ? date.toISOString().replace(/[^\d]/g, "") : "",
    );

    let origResData: PageInfoRecord | null;

    try {
      origResData = JSON.parse(origResources || "");
    } catch (e) {
      origResData = null;
    }

    const pageInfo: ReplayPageInfoRecord | undefined = this.pageInfos.get(page);

    if (!origResData) {
      logger.warn("Original resources missing / invalid", { url }, "replay");
      return;
    }

    if (!pageInfo) {
      logger.warn("Replay resources missing / invalid", { url }, "replay");
      return;
    }

    if (origResData.ts) {
      pageInfo.ts = origResData.ts;
    }

    const { resourceCounts } = pageInfo.comparison;

    const { good: crawlGood, bad: crawlBad } = this.countResources(origResData);
    const { good: replayGood, bad: replayBad } = this.countResources(pageInfo);

    resourceCounts.crawlGood = crawlGood;
    resourceCounts.crawlBad = crawlBad;
    resourceCounts.replayGood = replayGood;
    resourceCounts.replayBad = replayBad;

    logger.info("Resource counts", { url, ...resourceCounts }, "replay");
  }

  countResources(info: PageInfoRecord) {
    let good = 0;
    let bad = 0;

    for (const [url, { status }] of Object.entries(info.urls)) {
      if (!url.startsWith("http")) {
        continue;
      }
      if (url.indexOf("__wb_method") !== -1) {
        continue;
      }
      if (status >= 400) {
        bad++;
      } else {
        good++;
      }
    }

    return { bad, good };
  }

  async fetchOrigBinary(page: Page, type: string, url: string, ts: string) {
    const frame = page.frames()[1];
    if (!frame) {
      logger.warn("Replay frame missing", { url }, "replay");
      return;
    }

    const replayUrl = REPLAY_PREFIX + `${ts}mp_/urn:${type}:${url}`;

    const binaryString = await frame.evaluate(async (url) => {
      const response = await fetch(url, {
        method: "GET",
        credentials: "include",
      });
      if (response.status !== 200) {
        return "";
      }
      const blob = await response.blob();
      const result = new Promise((resolve, reject) => {
        const reader = new FileReader();
        reader.onloadend = () => resolve(reader.result);
        reader.onerror = reject;
        reader.readAsBinaryString(blob);
      });
      return result;
    }, replayUrl);

    if (!binaryString) {
      logger.warn("Couldn't fetch original data", { type, url, ts }, "replay");
    }

    return Buffer.from(binaryString as string, "binary");
  }

  async fetchOrigText(page: Page, type: string, url: string, ts: string) {
    const frame = page.frames()[1];
    if (!frame) {
      logger.warn("Replay frame missing", { url }, "replay");
      return;
    }

    const replayUrl = REPLAY_PREFIX + `${ts}mp_/urn:${type}:${url}`;

    const text = await frame.evaluate(async (url) => {
      const response = await fetch(url, {
        method: "GET",
        credentials: "include",
      });
      if (response.status !== 200) {
        return "";
      }
      return await response.text();
    }, replayUrl);

    if (!text) {
      logger.warn("Couldn't fetch original data", { type, url, ts }, "replay");
    }

    return text;
  }

  async teardownPage(opts: WorkerOpts) {
    const { page } = opts;
    await this.processPageInfo(page);
    await super.teardownPage(opts);
  }

  async processPageInfo(page: Page, state?: PageState) {
    const pageInfo = this.pageInfos.get(page);
    if (pageInfo) {
      if (!pageInfo.urls[pageInfo.url]) {
        logger.warn(
          "Replay resource: missing top-level page",
          { url: pageInfo.url },
          "replay",
        );
      }

      if (state) {
        const { comparison } = pageInfo;

        // add comparison to page state
        (state as ComparisonPageState).comparison = comparison;
      }

      const writer = new WARCResourceWriter({
        url: pageInfo.url,
        directory: this.archivesDir,
        warcPrefix: this.warcPrefix,
        date: new Date(),
        warcName: "info.warc.gz",
      });
      await writer.writeBufferToWARC(
        new TextEncoder().encode(JSON.stringify(pageInfo, null, 2)),
        "pageinfo",
        "application/json",
      );
      this.pageInfos.delete(page);
    }
  }

  protected pageEntryForRedis(
    entry: Record<string, string | number | boolean | object>,
    state: PageState,
  ) {
    entry.comparison = (state as ComparisonPageState).comparison;
    return entry;
  }

  createRecorder(): Recorder | null {
    return null;
  }
}

class WACZLoader {
  url: string;
  zipreader: ZipRangeReader;

  constructor(url: string) {
    this.url = url;
    this.zipreader = null;
  }

  async init() {
    if (!this.url.startsWith("http://") && !this.url.startsWith("https://")) {
      const blob = await openAsBlob(this.url);
      this.url = URL.createObjectURL(blob);
    }

    const loader = await createLoader({ url: this.url });

    this.zipreader = new ZipRangeReader(loader);
  }

  async loadFile(fileInZip: string) {
    const { reader } = await this.zipreader.loadFile(fileInZip);

    if (!reader) {
      return null;
    }

    if (!reader.iterLines) {
      return new AsyncIterReader(reader);
    }

    return reader;
  }
}
```
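Both match scores produced above normalize the same way: `(total - differing) / total`, so `1` means identical and values near `0` mean almost nothing matched. A worked example of the arithmetic, with illustrative inputs:

```ts
import levenshtein from "js-levenshtein";

// textMatch: edit distance normalized by the longer string.
const origText = "Hello, world!";
const replayText = "Hello, world"; // one character lost on replay
const dist = levenshtein(origText, replayText); // 1
const maxLen = Math.max(origText.length, replayText.length); // 13
console.log((maxLen - dist) / maxLen); // ~0.923

// screenshotMatch: differing pixels normalized by total pixels,
// e.g. 20736 differing pixels on a 1920x1080 replay screenshot:
console.log((1920 * 1080 - 20736) / (1920 * 1080)); // 0.99
```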
src/util/argParser.ts

```diff
@@ -536,10 +536,21 @@ class ArgParser {
         choices: SERVICE_WORKER_OPTS,
         default: "disabled",
       },
+
+      qaSource: {
+        describe: "Required for QA mode. Source (WACZ or multi WACZ) for QA",
+        type: "string",
+      },
+
+      qaDebugImageDiff: {
+        describe:
+          "if specified, will write crawl.png, replay.png and diff.png for each page where they're different",
+        type: "boolean",
+      },
     };
   }
 
-  parseArgs(argvParams?: string[]) {
+  parseArgs(argvParams?: string[], isQA = false) {
     let argv = argvParams || process.argv;
 
     if (process.env.CRAWL_ARGS) {
@@ -563,7 +574,7 @@ class ArgParser {
           return origConfig;
         },
       )
-      .check((argv) => this.validateArgs(argv)).argv;
+      .check((argv) => this.validateArgs(argv, isQA)).argv;
 
     return { parsed, origConfig };
   }
@@ -576,7 +587,7 @@ class ArgParser {
   }
 
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  validateArgs(argv: Record<string, any>) {
+  validateArgs(argv: Record<string, any>, isQA: boolean) {
     argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname;
     argv.collection = interpolateFilename(argv.collection, argv.crawlId);
 
@@ -631,33 +642,39 @@ class ArgParser {
       //logger.debug(`Set netIdleWait to ${argv.netIdleWait} seconds`);
     }
 
-    const scopeOpts = {
-      scopeType: argv.scopeType,
-      sitemap: argv.sitemap,
-      include: argv.include,
-      exclude: argv.exclude,
-      depth: argv.depth,
-      extraHops: argv.extraHops,
-    };
-
     argv.scopedSeeds = [];
 
-    for (let seed of argv.seeds) {
-      if (typeof seed === "string") {
-        seed = { url: seed };
-      }
-
-      try {
-        argv.scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...seed }));
-      } catch (e) {
-        if (argv.failOnFailedSeed) {
-          logger.fatal(`Invalid Seed "${seed.url}" specified, aborting crawl.`);
-        }
-      }
-    }
+    if (!isQA) {
+      const scopeOpts = {
+        scopeType: argv.scopeType,
+        sitemap: argv.sitemap,
+        include: argv.include,
+        exclude: argv.exclude,
+        depth: argv.depth,
+        extraHops: argv.extraHops,
+      };
+
+      for (let seed of argv.seeds) {
+        if (typeof seed === "string") {
+          seed = { url: seed };
+        }
+
+        try {
+          argv.scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...seed }));
+        } catch (e) {
+          if (argv.failOnFailedSeed) {
+            logger.fatal(
+              `Invalid Seed "${seed.url}" specified, aborting crawl.`,
+            );
+          }
+        }
+      }
 
-    if (!argv.scopedSeeds.length) {
-      logger.fatal("No valid seeds specified, aborting crawl.");
+      if (!argv.scopedSeeds.length) {
+        logger.fatal("No valid seeds specified, aborting crawl.");
+      }
+    } else if (!argv.qaSource) {
+      logger.fatal("--qaSource required for QA mode!");
     }
 
     // Resolve statsFilename
@@ -673,6 +690,6 @@ class ArgParser {
   }
 }
 
-export function parseArgs(argv?: string[]) {
-  return new ArgParser().parseArgs(argv);
+export function parseArgs(argv?: string[], isQA = false) {
+  return new ArgParser().parseArgs(argv, isQA);
 }
```
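Note the asymmetry this introduces: in QA mode the seed list and scope options are skipped entirely (`scopedSeeds` stays empty and only `--qaSource` is validated), since `ReplayCrawler._addInitialSeeds()` fills the queue from the pages recorded in the WACZ instead.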
@@ -19,22 +19,27 @@ import puppeteer, {
   Viewport,
 } from "puppeteer-core";
 import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
+import { Recorder } from "./recorder.js";

+type BtrixChromeOpts = {
+  proxy?: boolean;
+  userAgent?: string | null;
+  extraArgs?: string[];
+};
+
 type LaunchOpts = {
   profileUrl: string;
-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  chromeOptions: Record<string, any>;
+  chromeOptions: BtrixChromeOpts;
   signals: boolean;
   headless: boolean;
   // TODO: Fix this the next time the file is edited.
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
   emulateDevice?: Record<string, any>;
-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  ondisconnect?: ((err: any) => NonNullable<unknown>) | null;
+  ondisconnect?: ((err: unknown) => NonNullable<unknown>) | null;

   swOpt?: ServiceWorkerOpt;
+
+  recording: boolean;
 };

 // ==================================================================
@@ -48,9 +53,7 @@ export class Browser {
   browser?: PptrBrowser | null = null;
   firstCDP: CDPSession | null = null;

-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  recorders: any[] = [];
+  recorders: Recorder[] = [];

   swOpt?: ServiceWorkerOpt = "disabled";

@@ -66,6 +69,7 @@ export class Browser {
     emulateDevice = {},
     swOpt = "disabled",
     ondisconnect = null,
+    recording = true,
   }: LaunchOpts) {
     if (this.isLaunched()) {
       return;
@@ -105,7 +109,7 @@ export class Browser {
       userDataDir: this.profileDir,
     };

-    await this._init(launchOpts, ondisconnect);
+    await this._init(launchOpts, ondisconnect, recording);
   }

   async setupPage({ page }: { page: Page; cdp: CDPSession }) {
@@ -116,13 +120,13 @@ export class Browser {

     switch (this.swOpt) {
       case "disabled":
-        logger.info("Service Workers: always disabled", {}, "browser");
+        logger.debug("Service Workers: always disabled", {}, "browser");
         await page.setBypassServiceWorker(true);
         break;

       case "disabled-if-profile":
         if (this.customProfile) {
-          logger.info(
+          logger.debug(
             "Service Workers: disabled since using profile",
             {},
             "browser",
@@ -132,7 +136,7 @@ export class Browser {
         break;

       case "enabled":
-        logger.info("Service Workers: always enabled", {}, "browser");
+        logger.debug("Service Workers: always enabled", {}, "browser");
         break;
     }
   }
@@ -195,7 +199,11 @@ export class Browser {
     });
   }

-  chromeArgs({ proxy = true, userAgent = null, extraArgs = [] } = {}) {
+  chromeArgs({
+    proxy = true,
+    userAgent = null,
+    extraArgs = [],
+  }: BtrixChromeOpts) {
     // Chrome Flags, including proxy server
     const args = [
       // eslint-disable-next-line no-use-before-define
@@ -347,6 +355,7 @@ export class Browser {
     launchOpts: PuppeteerLaunchOptions,
     // eslint-disable-next-line @typescript-eslint/ban-types
     ondisconnect: Function | null = null,
+    recording: boolean,
   ) {
     this.browser = await puppeteer.launch(launchOpts);

@@ -354,7 +363,9 @@ export class Browser {

     this.firstCDP = await target.createCDPSession();

-    await this.serviceWorkerFetch();
+    if (recording) {
+      await this.serviceWorkerFetch();
+    }

     if (ondisconnect) {
       this.browser.on("disconnected", (err) => ondisconnect(err));
@@ -497,8 +508,6 @@ export class Browser {
     });
   }

-  // TODO: Fix this the next time the file is edited.
-
   async evaluateWithCLI(
     _: unknown,
     frame: Frame,
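The new `recording` flag exists because QA replay must not intercept service-worker fetches: ReplayWeb.page's own sw.js has to handle requests for the embedded archive. A minimal sketch of that gating, under trimmed, assumed local types (not the module's real signatures):

```
type SketchLaunchOpts = { recording: boolean; swOpt: "enabled" | "disabled" };

async function init(
  opts: SketchLaunchOpts,
  serviceWorkerFetch: () => Promise<void>,
) {
  if (opts.recording) {
    // crawl mode: wire up service-worker fetch interception so responses
    // fetched by service workers are still captured to WARC
    await serviceWorkerFetch();
  }
  // replay/QA mode: leave service workers alone so sw.js can serve the WACZ
}

await init({ recording: false, swOpt: "enabled" }, async () => {});
```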
@@ -48,6 +48,7 @@ export const LOG_CONTEXT_TYPES = [
   "crawlStatus",
   "links",
   "sitemap",
+  "replay",
 ] as const;

 export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
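With "replay" registered above, QA code can scope its log lines to a dedicated context. A hypothetical call site (the three-argument `logger.debug(message, data, context)` shape matches the calls elsewhere in this diff; the import path is assumed):

```
import { logger } from "./util/logger.js"; // path assumed

logger.debug("Loading page in replay", { url: "https://example.com/" }, "replay");
```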
src/util/replayserver.ts (new file, +146 lines)

import fs from "fs";
import fsp from "fs/promises";
import http, { IncomingMessage, ServerResponse } from "http";
import path from "path";

const replayHTML = fs.readFileSync(
  new URL("../../html/replay.html", import.meta.url),
  { encoding: "utf8" },
);

const swJS = fs.readFileSync(new URL("../../html/rwp/sw.js", import.meta.url), {
  encoding: "utf8",
});

const uiJS = fs.readFileSync(new URL("../../html/rwp/ui.js", import.meta.url), {
  encoding: "utf8",
});

// ============================================================================
const PORT = 9990;

// ============================================================================
export class ReplayServer {
  sourceUrl: string;
  origFileSource: string | null;
  sourceContentType: string | null;
  sourceSize?: number;

  constructor(sourceUrlOrFile: string) {
    if (
      sourceUrlOrFile.startsWith("http://") ||
      sourceUrlOrFile.startsWith("https://")
    ) {
      this.sourceUrl = sourceUrlOrFile;
      this.origFileSource = null;
      this.sourceContentType = null;
    } else {
      this.origFileSource = sourceUrlOrFile;
      const ext = path.extname(sourceUrlOrFile);
      this.sourceUrl = `/source${ext}`;

      switch (ext) {
        case ".wacz":
          this.sourceContentType = "application/wacz+zip";
          break;

        case ".json":
          this.sourceContentType = "application/json";
          break;

        default:
          this.sourceContentType = "application/octet-stream";
      }
    }
    const httpServer = http.createServer((req, res) =>
      this.handleRequest(req, res),
    );
    httpServer.listen(PORT);
  }

  get homePage() {
    return `http://localhost:${PORT}/`;
  }

  async handleRequest(request: IncomingMessage, response: ServerResponse) {
    const parsedUrl = new URL(
      request.url || "",
      `http://${request.headers.host}`,
    );
    const pathname = parsedUrl.pathname;

    switch (pathname) {
      case "/":
        response.writeHead(200, { "Content-Type": "text/html" });
        response.end(replayHTML.replace("$SOURCE", this.sourceUrl));
        return;

      case "/sw.js":
      case "/sw.js?serveIndex=1":
      case "/replay/sw.js":
      case "/replay/sw.js?serveIndex=1":
        response.writeHead(200, { "Content-Type": "application/javascript" });
        response.end(swJS);
        return;

      case "/ui.js":
        response.writeHead(200, { "Content-Type": "application/javascript" });
        response.end(uiJS);
        return;

      case this.sourceUrl:
        if (this.sourceContentType && this.origFileSource) {
          if (!this.sourceSize) {
            const { size } = await fsp.stat(this.origFileSource);
            this.sourceSize = size;
          }
          const { opts, status, contentRange, contentLength } =
            this.getRespOptsForRequest(request, this.sourceSize);
          response.writeHead(status, {
            "Accept-Ranges": "bytes",
            "Content-Type": this.sourceContentType,
            "Content-Length": contentLength,
            "Content-Range": contentRange,
          });
          //console.log(request.method, contentRange, opts);
          if (request.method === "GET") {
            fs.createReadStream(this.origFileSource, opts).pipe(response);
          } else {
            response.end();
          }
          break;
        }
      // falls through

      default:
        response.writeHead(404, { "Content-Type": "application/json" });
        response.end(JSON.stringify({ error: "not_found" }));
        return;
    }
  }

  getRespOptsForRequest(request: IncomingMessage, total: number) {
    const range = request.headers["range"] || "";
    const array = range.match(/bytes=(\d+)-(\d*)/);
    let contentRange = undefined;

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const opts: Record<string, any> = {};
    if (array) {
      opts.start = parseInt(array[1]);
      opts.end = parseInt(array[2]);
      if (isNaN(opts.end)) {
        opts.end = undefined;
      }
      const end = opts.end || total - 1;
      contentRange = `bytes ${opts.start}-${end}/${total}`;
      return {
        status: 206,
        opts,
        contentRange,
        contentLength: end - opts.start + 1,
      };
    }
    return { status: 200, opts, contentRange, contentLength: total };
  }
}
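The byte-range handling above is what lets the ReplayWeb.page service worker pull slices of a large WACZ instead of downloading it whole. A standalone re-implementation of the same arithmetic for illustration (not the module itself), with expected results in comments:

```
function respOpts(rangeHeader: string, total: number) {
  const m = rangeHeader.match(/bytes=(\d+)-(\d*)/);
  if (!m) {
    return { status: 200, contentLength: total };
  }
  const start = parseInt(m[1]);
  const end = m[2] ? parseInt(m[2]) : total - 1; // open-ended range runs to EOF
  return {
    status: 206,
    contentRange: `bytes ${start}-${end}/${total}`,
    contentLength: end - start + 1,
  };
}

console.log(respOpts("bytes=100-199", 1000));
// { status: 206, contentRange: 'bytes 100-199/1000', contentLength: 100 }
console.log(respOpts("bytes=500-", 1000));
// { status: 206, contentRange: 'bytes 500-999/1000', contentLength: 500 }
console.log(respOpts("", 1000));
// { status: 200, contentLength: 1000 }
```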
@@ -3,48 +3,63 @@ import sharp from "sharp";
 import { WARCResourceWriter } from "./warcresourcewriter.js";
 import { logger, formatErr } from "./logger.js";
 import { Browser } from "./browser.js";
+import { Page } from "puppeteer-core";
+import { PageState } from "./state.js";

 // ============================================================================

-type ScreenShotType = {
-  type: string;
+type ScreenShotDesc = {
+  type: "png" | "jpeg";
   omitBackground: boolean;
   fullPage: boolean;
+  encoding: "binary";
 };

-export const screenshotTypes: Record<string, ScreenShotType> = {
+type ScreeshotType = "view" | "thumbnail" | "fullPage";
+
+export const screenshotTypes: Record<string, ScreenShotDesc> = {
   view: {
     type: "png",
     omitBackground: true,
     fullPage: false,
+    encoding: "binary",
   },
   thumbnail: {
     type: "jpeg",
     omitBackground: true,
     fullPage: false,
+    encoding: "binary",
   },
   fullPage: {
     type: "png",
     omitBackground: true,
     fullPage: true,
+    encoding: "binary",
   },
 };

+export type ScreenshotOpts = {
+  browser: Browser;
+  page: Page;
+  url: string;
+  directory: string;
+  warcPrefix: string;
+};
+
 export class Screenshots extends WARCResourceWriter {
   browser: Browser;
-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  page: any;
+  page: Page;

-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  constructor(opts: any) {
+  constructor(opts: ScreenshotOpts) {
     super({ ...opts, warcName: "screenshots.warc.gz" });
     this.browser = opts.browser;
     this.page = opts.page;
   }

-  async take(screenshotType = "view") {
+  async take(
+    screenshotType: ScreeshotType = "view",
+    state: PageState | null = null,
+  ) {
     try {
       if (screenshotType !== "fullPage") {
         await this.browser.setViewport(this.page, {
@@ -54,6 +69,9 @@ export class Screenshots extends WARCResourceWriter {
       }
       const options = screenshotTypes[screenshotType];
       const screenshotBuffer = await this.page.screenshot(options);
+      if (state && screenshotType === "view") {
+        state.screenshotView = screenshotBuffer;
+      }
       await this.writeBufferToWARC(
         screenshotBuffer,
         screenshotType,
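Stashing the "view" screenshot on `PageState` is what gives the QA comparison access to both the crawl-time and replay-time images. The scoring code itself is not in this hunk; a hedged sketch of one plausible way a `screenshotMatch` ratio could be derived with sharp (which this module already imports), assuming both buffers are PNGs of the same viewport:

```
import sharp from "sharp";

// Fraction of identical bytes after normalizing both images to the same
// raw RGB layout; 1.0 means pixel-identical. Illustrative only.
async function screenshotMatch(crawlPng: Buffer, replayPng: Buffer) {
  const toRaw = (buf: Buffer) =>
    sharp(buf).resize(1920, 1080, { fit: "fill" }).removeAlpha().raw().toBuffer();
  const [a, b] = await Promise.all([toRaw(crawlPng), toRaw(replayPng)]);
  let same = 0;
  for (let i = 0; i < a.length; i++) {
    if (a[i] === b[i]) same++;
  }
  return same / a.length;
}
```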
@@ -33,6 +33,8 @@ export type QueueEntry = {
   seedId: number;
   depth: number;
   extraHops: number;
+  ts?: number;
+  pageid?: string;
 };

 // ============================================================================
@@ -66,6 +68,7 @@ export class PageState {

   isHTMLPage?: boolean;
   text?: string;
+  screenshotView?: Buffer;
   favicon?: string;

   skipBehaviors = false;
@@ -79,7 +82,10 @@ export class PageState {
     this.seedId = redisData.seedId;
     this.depth = redisData.depth;
     this.extraHops = redisData.extraHops || 0;
-    this.pageid = uuidv4();
+    if (redisData.ts) {
+      this.ts = new Date(redisData.ts);
+    }
+    this.pageid = redisData.pageid || uuidv4();
     this.status = 0;
   }
 }
@@ -472,12 +478,26 @@ return 0;

   //async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) {
   async addToQueue(
-    { url, seedId, depth = 0, extraHops = 0 }: QueueEntry,
+    {
+      url,
+      seedId,
+      depth = 0,
+      extraHops = 0,
+      ts = 0,
+      pageid = undefined,
+    }: QueueEntry,
     limit = 0,
   ) {
     const added = this._timestamp();
     const data: QueueEntry = { added, url, seedId, depth, extraHops };
+
+    if (ts) {
+      data.ts = ts;
+    }
+    if (pageid) {
+      data.pageid = pageid;
+    }

     // return codes
     // 0 - url queued successfully
     // 1 - url queue size limit reached
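The new `ts` and `pageid` fields let a QA run queue each page under its crawl-time identity, so the replay record lines up with the original. An illustrative entry (values are placeholders, not from this diff):

```
import { v4 as uuidv4 } from "uuid";

// Shape mirrors QueueEntry above; in QA mode, ts and pageid would come from
// the crawl-time page record rather than being generated fresh.
const qaEntry = {
  url: "https://webrecorder.net/",
  seedId: 0,
  depth: 0,
  extraHops: 0,
  ts: Date.parse("2024-03-01T00:00:00Z"), // original capture time, epoch ms
  pageid: uuidv4(), // placeholder; real QA reuses the original page's id
};

console.log(qaEntry);
```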
@@ -2,17 +2,25 @@ import { WARCResourceWriter } from "./warcresourcewriter.js";
 import { logger } from "./logger.js";
 import { CDPSession, Protocol } from "puppeteer-core";

+// ============================================================================
+type TextExtractOpts = {
+  url: string;
+  directory: string;
+  warcPrefix: string;
+  skipDocs: number;
+};
+
 // ============================================================================
 export abstract class BaseTextExtract extends WARCResourceWriter {
   cdp: CDPSession;
   lastText: string | null = null;
   text: string | null = null;
+  skipDocs: number = 0;

-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  constructor(cdp: CDPSession, opts: any) {
+  constructor(cdp: CDPSession, opts: TextExtractOpts) {
     super({ ...opts, warcName: "text.warc.gz" });
     this.cdp = cdp;
+    this.skipDocs = opts.skipDocs || 0;
   }

   async extractAndStoreText(
@@ -83,7 +91,7 @@ export class TextExtractViaSnapshot extends BaseTextExtract {

     const accum: string[] = [];

-    for (const doc of documents) {
+    for (const doc of documents.slice(this.skipDocs)) {
       const nodeValues = doc.nodes.nodeValue || [];
       const nodeNames = doc.nodes.nodeName || [];
       const nodeTypes = doc.nodes.nodeType || [];
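`skipDocs` addresses a replay-specific wrinkle: `DOMSnapshot.captureSnapshot` returns one entry per document with the main frame first, and under replay the main document is the ReplayWeb.page shell rather than the archived page. Presumably QA mode passes `skipDocs: 1` to drop the shell from text extraction. The slice in miniature:

```
const documents = ["replay-shell", "archived-page", "archived-iframe"];
const skipDocs = 1; // assumed QA-mode value

console.log(documents.slice(skipDocs)); // [ 'archived-page', 'archived-iframe' ]
console.log(documents.slice(0));        // crawl mode: all documents kept
```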
@@ -2,10 +2,17 @@ import fs from "fs";
 import path from "path";
 import * as warcio from "warcio";

+// ===========================================================================
+export type WARCResourceWriterOpts = {
+  url: string;
+  directory: string;
+  date?: Date;
+  warcName: string;
+  warcPrefix: string;
+};
+
 // ===========================================================================
 export class WARCResourceWriter {
-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  page: any;
   url: string;
   directory: string;
   warcName: string;
@@ -17,13 +24,7 @@ export class WARCResourceWriter {
     date,
     warcPrefix,
     warcName,
-  }: {
-    url: string;
-    directory: string;
-    date: Date;
-    warcPrefix: string;
-    warcName: string;
-  }) {
+  }: WARCResourceWriterOpts) {
     this.url = url;
     this.directory = directory;
     this.warcName = path.join(this.directory, warcPrefix + warcName);
@@ -14,7 +14,6 @@ const NEW_WINDOW_TIMEOUT = 20;
 const TEARDOWN_TIMEOUT = 10;
 const FINISHED_TIMEOUT = 60;

-// ===========================================================================
 export type WorkerOpts = {
   page: Page;
   cdp: CDPSession;
@@ -39,6 +38,7 @@ export class PageWorker {
   maxPageTime: number;

   reuseCount = 0;
+  alwaysReuse: boolean;
   page?: Page | null;
   cdp?: CDPSession | null;

@@ -55,27 +55,22 @@ export class PageWorker {
   markCrashed?: (reason: string) => void;
   crashBreak?: Promise<void>;

-  recorder: Recorder;
+  recorder: Recorder | null;

   constructor(
     id: WorkerId,
     crawler: Crawler,
     maxPageTime: number,
-    collDir: string,
+    alwaysReuse = false,
   ) {
     this.id = id;
     this.crawler = crawler;
     this.maxPageTime = maxPageTime;
+    this.alwaysReuse = alwaysReuse;

     this.logDetails = { workerid: this.id };

-    this.recorder = new Recorder({
-      workerid: id,
-      collDir,
-      crawler: this.crawler,
-    });
-
-    this.crawler.browser.recorders.push(this.recorder);
+    this.recorder = this.crawler.createRecorder(this.id);
   }

   async closePage() {
@@ -133,19 +128,18 @@ export class PageWorker {
   }

   async initPage(url: string): Promise<WorkerOpts> {
-    if (
-      !this.crashed &&
-      this.page &&
-      this.opts &&
-      ++this.reuseCount <= MAX_REUSE &&
-      this.isSameOrigin(url)
-    ) {
+    let reuse = !this.crashed && !!this.opts && !!this.page;
+    if (!this.alwaysReuse) {
+      ++this.reuseCount;
+      reuse = this.reuseCount <= MAX_REUSE && this.isSameOrigin(url);
+    }
+    if (reuse) {
       logger.debug(
         "Reusing page",
         { reuseCount: this.reuseCount, ...this.logDetails },
         "worker",
       );
-      return this.opts;
+      return this.opts!;
     } else if (this.page) {
       await this.closePage();
     }
@@ -176,7 +170,7 @@ export class PageWorker {
     this.cdp = cdp;
     this.callbacks = {};
     const directFetchCapture = this.recorder
-      ? (x: string) => this.recorder.directFetchCapture(x)
+      ? (x: string) => this.recorder!.directFetchCapture(x)
       : null;
     this.opts = {
       page,
@@ -405,10 +399,11 @@ export async function runWorkers(
   crawler: Crawler,
   numWorkers: number,
   maxPageTime: number,
-  collDir: string,
+  alwaysReuse = false,
 ) {
   logger.info(`Creating ${numWorkers} workers`, {}, "worker");

+  const workers = [];
   let offset = 0;

   // automatically set worker start by ordinal in k8s
@@ -426,7 +421,7 @@ export async function runWorkers(
   }

   for (let i = 0; i < numWorkers; i++) {
-    workers.push(new PageWorker(i + offset, crawler, maxPageTime, collDir));
+    workers.push(new PageWorker(i + offset, crawler, maxPageTime, alwaysReuse));
   }

   await Promise.allSettled(workers.map((worker) => worker.run()));
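The reuse rule above, isolated: a normal crawl rotates pages after `MAX_REUSE` same-origin loads, while QA replay (`alwaysReuse = true`) keeps one page alive for the whole run, since every load is the same ReplayWeb.page embed. A standalone sketch (the `MAX_REUSE` value is assumed):

```
const MAX_REUSE = 5; // assumed; defined near the top of worker.ts

function shouldReuse(
  alwaysReuse: boolean,
  crashed: boolean,
  reuseCount: number,
  sameOrigin: boolean,
): boolean {
  if (crashed) return false;
  if (alwaysReuse) return true; // QA replay: never rotate the page
  return reuseCount <= MAX_REUSE && sameOrigin;
}

console.log(shouldReuse(true, false, 100, false)); // true  (QA replay)
console.log(shouldReuse(false, false, 6, true));   // false (rotate page)
```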
@@ -1,6 +1,10 @@
 import { exec } from "child_process";
 import Redis from "ioredis";

+function sleep(ms) {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
 test("dynamically add exclusion while crawl is running", async () => {
   let callback = null;

@@ -12,7 +16,7 @@ test("dynamically add exclusion while crawl is running", async () => {

   try {
     exec(
-      "docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis",
+      "docker run -p 36382:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis",
       { shell: "/bin/bash" },
       callback,
     );
@@ -20,18 +24,18 @@ test("dynamically add exclusion while crawl is running", async () => {
     console.log(error);
   }

-  await new Promise((resolve) => setTimeout(resolve, 3000));
+  await sleep(3000);

-  const redis = new Redis("redis://127.0.0.1:36379/0", { lazyConnect: true });
+  const redis = new Redis("redis://127.0.0.1:36382/0", { lazyConnect: true, retryStrategy: () => null })

-  await redis.connect({ maxRetriesPerRequest: 50 });
+  await redis.connect();

   while (true) {
     if (Number(await redis.zcard("test:q")) > 1) {
       break;
     }

-    await new Promise((resolve) => setTimeout(resolve, 500));
+    await sleep(500);
   }

   const uids = await redis.hkeys("test:status");
@@ -48,6 +52,5 @@ test("dynamically add exclusion while crawl is running", async () => {
   expect(stdout.indexOf("Add Exclusion") > 0).toBe(true);

   expect(stdout.indexOf("Removing excluded URL") > 0).toBe(true);
-
-  await redis.disconnect();
 });
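The `retryStrategy: () => null` option that now appears throughout these tests tells ioredis to give up instead of retrying forever once the crawler container (and its redis) goes away, which lets jest tear down cleanly. In isolation:

```
import Redis from "ioredis";

const redis = new Redis("redis://127.0.0.1:36382/0", {
  lazyConnect: true,         // defer connecting until .connect() is called
  retryStrategy: () => null, // returning null aborts reconnection attempts
});
```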
tests/qa_compare.test.js (new file, +75 lines)

import child_process from "child_process";
import fs from "fs";
import { Redis } from "ioredis";

const sleep = (ms) => new Promise((res) => setTimeout(res, ms));

test("run initial crawl with text and screenshots to prepare for QA", async () => {
  fs.rmSync("./test-crawls/qa-wr-net", { recursive: true, force: true });

  child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --url https://webrecorder.net/about --url https://browsertrix.com/ --scopeType page --collection qa-wr-net --text to-warc --screenshot view --generateWACZ",
  );

  expect(
    fs.existsSync("test-crawls/collections/qa-wr-net/qa-wr-net.wacz"),
  ).toBe(true);
});

test("run QA comparison, with write pages to redis", async () => {
  fs.rmSync("./test-crawls/qa-wr-net-replay", { recursive: true, force: true });

  const child = child_process.exec(
    "docker run -p 36380:6379 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler qa --qaSource /crawls/collections/qa-wr-net/qa-wr-net.wacz --collection qa-wr-net-replay --crawlId test --qaDebugImageDiff --writePagesToRedis --debugAccessRedis",
  );

  // detect crawler exit
  let crawler_exited = false;
  child.on("exit", function () {
    crawler_exited = true;
  });

  const redis = new Redis("redis://127.0.0.1:36380/0", { lazyConnect: true, retryStrategy: () => null });

  await sleep(3000);

  await redis.connect({ maxRetriesPerRequest: 50 });

  let count = 0;

  while (count < 3) {
    const res = await redis.lpop("test:pages");
    if (!res) {
      if (crawler_exited) {
        break;
      }
      await sleep(100);
      continue;
    }
    const json = JSON.parse(res);
    expect(json).toHaveProperty("id");
    expect(json).toHaveProperty("url");
    expect(json).toHaveProperty("ts");
    expect(json).toHaveProperty("title");
    expect(json).toHaveProperty("loadState");
    expect(json).toHaveProperty("comparison");

    expect(json.comparison).toHaveProperty("screenshotMatch");
    expect(json.comparison).toHaveProperty("textMatch");
    expect(json.comparison).toHaveProperty("resourceCounts");

    expect(json.comparison.resourceCounts).toHaveProperty("crawlGood");
    expect(json.comparison.resourceCounts).toHaveProperty("crawlBad");
    expect(json.comparison.resourceCounts).toHaveProperty("replayGood");
    expect(json.comparison.resourceCounts).toHaveProperty("replayBad");

    count++;
  }

  expect(count).toBe(3);

  // wait for crawler exit
  while (!crawler_exited) {
    await sleep(100);
  }
});
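Outside the test, the same per-page records can be consumed from the `<crawlId>:pages` list while a QA run like the one above is active. A sketch assuming `--crawlId test` and the port mapping used by the test:

```
import Redis from "ioredis";

const redis = new Redis("redis://127.0.0.1:36380/0");

// each entry is a JSON page record with the comparison block attached
const raw = await redis.lpop("test:pages");
if (raw) {
  const page = JSON.parse(raw);
  console.log(page.url, page.comparison?.screenshotMatch);
}
redis.disconnect();
```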
@@ -117,9 +117,11 @@ test("check parsing saved state + page done + queue present", () => {
 test("check crawl restarted with saved state", async () => {
   let containerId = null;

+  const port = 36379;
+
   try {
     containerId = execSync(
-      `docker run -d -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`,
+      `docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`,
       { encoding: "utf-8" },
     );
   } catch (error) {
@@ -128,14 +130,11 @@ test("check crawl restarted with saved state", async () => {

   await sleep(2000);

-  const redis = new Redis("redis://127.0.0.1:36379/0", { lazyConnect: true });
+  const redis = new Redis(`redis://127.0.0.1:${port}/0`, { lazyConnect: true, retryStrategy: () => null });

   try {
     await redis.connect({
       maxRetriesPerRequest: 100,
-      retryStrategy(times) {
-        return times < 100 ? 1000 : null;
-      },
     });

     await sleep(2000);
@@ -150,11 +149,5 @@ test("check crawl restarted with saved state", async () => {
     console.log(e);
   } finally {
     await waitContainer(containerId);
-
-    try {
-      await redis.disconnect();
-    } catch (e) {
-      // ignore
-    }
   }
 });
@@ -1,7 +1,6 @@
 import child_process from "child_process";
 import Redis from "ioredis";

-
 function sleep(ms) {
   return new Promise((resolve) => setTimeout(resolve, ms));
 }
@@ -31,20 +30,17 @@ async function waitContainer(containerId) {
 }

 async function runCrawl(numExpected, url, sitemap="", limit=0) {
-  const containerId = child_process.execSync(`docker run -d -p 36379:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis`, {encoding: "utf-8"});
+  const containerId = child_process.execSync(`docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis`, {encoding: "utf-8"});

-  await sleep(2000);
+  await sleep(3000);

-  const redis = new Redis("redis://127.0.0.1:36379/0", { lazyConnect: true });
+  const redis = new Redis("redis://127.0.0.1:36381/0", { lazyConnect: true, retryStrategy: () => null });

   let finished = 0;

   try {
     await redis.connect({
       maxRetriesPerRequest: 100,
-      retryStrategy(times) {
-        return times < 100 ? 1000 : null;
-      },
     });

     while (true) {
@@ -58,11 +54,6 @@ async function runCrawl(numExpected, url, sitemap="", limit=0) {
     console.error(e);
   } finally {
     await waitContainer(containerId);
-    try {
-      await redis.disconnect();
-    } catch (e) {
-      // ignore
-    }
   }

   expect(finished).toBeGreaterThanOrEqual(numExpected);
@@ -79,4 +70,3 @@ test("test sitemap with limit", async () => {
 test("test sitemap with limit, specific URL", async () => {
   await runCrawl(1900, "https://www.mozilla.org/", "https://www.mozilla.org/sitemap.xml", 2000);
 });
-