TypeScript Conversion (#425)

Follows #424. Converts the upcoming 1.0.0 branch (which is based on native browser-based traffic capture and recording) to TypeScript. Fixes #426.

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Co-authored-by: emma <hi@emma.cafe>
This commit is contained in:
Ilya Kreymer 2023-11-09 11:27:11 -08:00 committed by GitHub
parent 877d9f5b44
commit af1e0860e4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
36 changed files with 2446 additions and 1406 deletions

View file

@ -1,39 +1,32 @@
module.exports = { module.exports = {
"env": { env: {
"browser": true, browser: true,
"es2021": true, es2021: true,
"node": true, node: true,
"jest": true jest: true,
}, },
"extends": "eslint:recommended", extends: ["eslint:recommended", "plugin:@typescript-eslint/recommended"],
"parserOptions": { parser: "@typescript-eslint/parser",
"ecmaVersion": 12, plugins: ["@typescript-eslint"],
"sourceType": "module" parserOptions: {
}, ecmaVersion: 12,
"rules": { sourceType: "module",
"indent": [ },
"error", rules: {
2 indent: ["error", 2],
], "linebreak-style": ["error", "unix"],
"linebreak-style": [ quotes: ["error", "double"],
"error", semi: ["error", "always"],
"unix" "no-constant-condition": ["error", { checkLoops: false }],
], "no-use-before-define": [
"quotes": [ "error",
"error", {
"double" variables: true,
], functions: false,
"semi": [ classes: false,
"error", allowNamedExports: true,
"always" },
], ],
"no-constant-condition": [ },
"error", reportUnusedDisableDirectives: true,
{"checkLoops": false }
],
"no-use-before-define": [
"error",
{"variables": true, "functions": false, "classes": false, "allowNamedExports": true}
]
}
}; };

View file

@ -40,6 +40,8 @@ jobs:
node-version: ${{ matrix.node-version }} node-version: ${{ matrix.node-version }}
- name: install requirements - name: install requirements
run: yarn install run: yarn install
- name: build js
run: yarn run tsc
- name: build docker - name: build docker
run: docker-compose build run: docker-compose build
- name: run jest - name: run jest

View file

@ -38,14 +38,18 @@ RUN mkdir -p /tmp/ads && cd /tmp/ads && \
RUN yarn install --network-timeout 1000000 RUN yarn install --network-timeout 1000000
ADD *.js /app/ ADD tsconfig.json /app/
ADD util/*.js /app/util/ ADD src /app/src
RUN yarn run tsc
ADD config/ /app/ ADD config/ /app/
ADD html/ /app/html/ ADD html/ /app/html/
RUN ln -s /app/main.js /usr/bin/crawl; ln -s /app/create-login-profile.js /usr/bin/create-login-profile RUN chmod u+x /app/dist/main.js /app/dist/create-login-profile.js
RUN ln -s /app/dist/main.js /usr/bin/crawl; ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile
WORKDIR /crawls WORKDIR /crawls

View file

@ -1,4 +0,0 @@
// Default driver entry point: receives one options object and hands its
// `page` and `data` to `crawler.loadPage`, resolving once that call settles.
export default async (opts) => {
  const { data, page, crawler } = opts;
  await crawler.loadPage(page, data);
};

View file

@ -7,7 +7,8 @@
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software", "author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
"license": "AGPL-3.0-or-later", "license": "AGPL-3.0-or-later",
"scripts": { "scripts": {
"lint": "eslint *.js util/*.js tests/*.test.js", "tsc": "tsc",
"lint": "eslint *.js tests/*.test.js",
"test": "yarn node --experimental-vm-modules $(yarn bin jest --bail 1)", "test": "yarn node --experimental-vm-modules $(yarn bin jest --bail 1)",
"prepare": "husky install" "prepare": "husky install"
}, },
@ -18,23 +19,31 @@
"crc": "^4.3.2", "crc": "^4.3.2",
"get-folder-size": "^4.0.0", "get-folder-size": "^4.0.0",
"husky": "^8.0.3", "husky": "^8.0.3",
"ioredis": "^4.27.1", "ioredis": "^5.3.2",
"js-yaml": "^4.1.0", "js-yaml": "^4.1.0",
"minio": "7.0.26", "minio": "^7.1.3",
"p-queue": "^7.3.4", "p-queue": "^7.3.4",
"puppeteer-core": "^20.7.4", "puppeteer-core": "^20.7.4",
"sharp": "^0.32.1", "sharp": "^0.32.1",
"sitemapper": "^3.2.5", "sitemapper": "^3.2.6",
"tsc": "^2.0.4",
"uuid": "8.3.2", "uuid": "8.3.2",
"warcio": "^2.2.0", "warcio": "^2.2.1",
"ws": "^7.4.4", "ws": "^7.4.4",
"yargs": "^17.7.2" "yargs": "^17.7.2"
}, },
"devDependencies": { "devDependencies": {
"eslint": "^8.37.0", "@types/js-yaml": "^4.0.8",
"@types/node": "^20.8.7",
"@types/uuid": "^9.0.6",
"@types/ws": "^8.5.8",
"@typescript-eslint/eslint-plugin": "^6.10.0",
"@typescript-eslint/parser": "^6.10.0",
"eslint": "^8.53.0",
"eslint-plugin-react": "^7.22.0", "eslint-plugin-react": "^7.22.0",
"jest": "^29.2.1", "jest": "^29.2.1",
"md5": "^2.3.0" "md5": "^2.3.0",
"typescript": "^5.2.2"
}, },
"jest": { "jest": {
"transform": {}, "transform": {},

File diff suppressed because it is too large Load diff

View file

@ -2,24 +2,25 @@
import fs from "fs"; import fs from "fs";
import path from "path"; import path from "path";
import http from "http"; import http, { IncomingMessage, ServerResponse } from "http";
import readline from "readline"; import readline from "readline";
import child_process from "child_process"; import child_process from "child_process";
import yargs from "yargs"; import yargs, { Options } from "yargs";
import { logger } from "./util/logger.js"; import { logger } from "./util/logger.js";
import { Browser } from "./util/browser.js"; import { Browser } from "./util/browser.js";
import { initStorage } from "./util/storage.js"; import { initStorage } from "./util/storage.js";
import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
const profileHTML = fs.readFileSync(new URL("html/createProfile.html", import.meta.url), {encoding: "utf8"}); const profileHTML = fs.readFileSync(new URL("../html/createProfile.html", import.meta.url), {encoding: "utf8"});
const vncHTML = fs.readFileSync(new URL("html/vnc_lite.html", import.meta.url), {encoding: "utf8"}); const vncHTML = fs.readFileSync(new URL("../html/vnc_lite.html", import.meta.url), {encoding: "utf8"});
const behaviors = fs.readFileSync(new URL("./node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"}); const behaviors = fs.readFileSync(new URL("../node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});
function cliOpts() { function cliOpts(): { [key: string]: Options } {
return { return {
"url": { "url": {
describe: "The URL of the login page", describe: "The URL of the login page",
@ -93,7 +94,7 @@ function cliOpts() {
} }
function getDefaultWindowSize() { function getDefaultWindowSize() {
const values = process.env.GEOMETRY.split("x"); const values = (process.env.GEOMETRY || "").split("x");
const x = Number(values[0]); const x = Number(values[0]);
const y = Number(values[1]); const y = Number(values[1]);
return `${x},${y}`; return `${x},${y}`;
@ -102,23 +103,23 @@ function getDefaultWindowSize() {
async function main() { async function main() {
const params = yargs(process.argv) // eslint-disable-next-line @typescript-eslint/no-explicit-any
const params : any = yargs(process.argv)
.usage("browsertrix-crawler profile [options]") .usage("browsertrix-crawler profile [options]")
.option(cliOpts()) .option(cliOpts())
.argv; .argv;
logger.setDebugLogging(true); logger.setDebugLogging(true);
if (!params.headless) { if (!params.headless) {
logger.debug("Launching XVFB"); logger.debug("Launching XVFB");
child_process.spawn("Xvfb", [ child_process.spawn("Xvfb", [
process.env.DISPLAY, process.env.DISPLAY || "",
"-listen", "-listen",
"tcp", "tcp",
"-screen", "-screen",
"0", "0",
process.env.GEOMETRY, process.env.GEOMETRY || "",
"-ac", "-ac",
"+extension", "+extension",
"RANDR" "RANDR"
@ -137,9 +138,9 @@ async function main() {
"-rfbport", "-rfbport",
"6080", "6080",
"-passwd", "-passwd",
process.env.VNC_PASS, process.env.VNC_PASS || "",
"-display", "-display",
process.env.DISPLAY process.env.DISPLAY || ""
]); ]);
} }
@ -178,7 +179,7 @@ async function main() {
const { page, cdp } = await browser.newWindowPageWithCDP(); const { page, cdp } = await browser.newWindowPageWithCDP();
const waitUntil = "load"; const waitUntil : PuppeteerLifeCycleEvent = "load";
await page.setCacheEnabled(false); await page.setCacheEnabled(false);
@ -203,7 +204,9 @@ async function main() {
} }
} }
async function automatedProfile(params, browser, page, cdp, waitUntil) { // eslint-disable-next-line @typescript-eslint/no-explicit-any
async function automatedProfile(params: any, browser: Browser, page: Page, cdp: CDPSession,
waitUntil: PuppeteerLifeCycleEvent) {
let u, p; let u, p;
logger.debug("Looking for username and password entry fields on page..."); logger.debug("Looking for username and password entry fields on page...");
@ -222,12 +225,12 @@ async function automatedProfile(params, browser, page, cdp, waitUntil) {
return; return;
} }
await u.type(params.user); await u!.type(params.user);
await p.type(params.password); await p!.type(params.password);
await Promise.allSettled([ await Promise.allSettled([
p.press("Enter"), p!.press("Enter"),
page.waitForNavigation({waitUntil}) page.waitForNavigation({waitUntil})
]); ]);
@ -240,7 +243,8 @@ async function automatedProfile(params, browser, page, cdp, waitUntil) {
process.exit(0); process.exit(0);
} }
async function createProfile(params, browser, page, cdp, targetFilename = "") { // eslint-disable-next-line @typescript-eslint/no-explicit-any
async function createProfile(params: any, browser: Browser, page: Page, cdp: CDPSession, targetFilename = "") {
await cdp.send("Network.clearBrowserCache"); await cdp.send("Network.clearBrowserCache");
await browser.close(); await browser.close();
@ -268,8 +272,9 @@ async function createProfile(params, browser, page, cdp, targetFilename = "") {
return resource; return resource;
} }
function promptInput(msg, hidden = false) { function promptInput(msg: string, hidden = false) {
const rl = readline.createInterface({ // eslint-disable-next-line @typescript-eslint/no-explicit-any
const rl : any = readline.createInterface({
input: process.stdin, input: process.stdin,
output: process.stdout output: process.stdout
}); });
@ -290,8 +295,8 @@ function promptInput(msg, hidden = false) {
}); });
} }
return new Promise((resolve) => { return new Promise<string>((resolve) => {
rl.question(msg, function (res) { rl.question(msg, function (res: string) {
rl.close(); rl.close();
resolve(res); resolve(res);
}); });
@ -300,9 +305,31 @@ function promptInput(msg, hidden = false) {
class InteractiveBrowser { class InteractiveBrowser {
constructor(params, browser, page, cdp, targetId) { // eslint-disable-next-line @typescript-eslint/no-explicit-any
params: any;
browser: Browser;
page: Page;
cdp: CDPSession;
targetId: string;
originSet = new Set<string>();
shutdownWait: number;
shutdownTimer: NodeJS.Timer | null;
constructor(
// eslint-disable-next-line @typescript-eslint/no-explicit-any
params: any,
browser: Browser,
page: Page,
cdp: CDPSession,
targetId: string
) {
logger.info("Creating Profile Interactively..."); logger.info("Creating Profile Interactively...");
child_process.spawn("socat", ["tcp-listen:9222,reuseaddr,fork", "tcp:localhost:9221"]); child_process.spawn("socat", [
"tcp-listen:9222,reuseaddr,fork",
"tcp:localhost:9221",
]);
this.params = params; this.params = params;
this.browser = browser; this.browser = browser;
@ -311,8 +338,6 @@ class InteractiveBrowser {
this.targetId = targetId; this.targetId = targetId;
this.originSet = new Set();
this.addOrigin(); this.addOrigin();
page.on("load", () => this.handlePageLoad()); page.on("load", () => this.handlePageLoad());
@ -323,25 +348,31 @@ class InteractiveBrowser {
cdp.on("Page.windowOpen", async (resp) => { cdp.on("Page.windowOpen", async (resp) => {
if (resp.url) { if (resp.url) {
await cdp.send("Target.activateTarget", {targetId: this.targetId}); await cdp.send("Target.activateTarget", { targetId: this.targetId });
await page.goto(resp.url); await page.goto(resp.url);
} }
}); });
} }
this.shutdownWait = params.shutdownWait * 1000; this.shutdownWait = params.shutdownWait * 1000;
if (this.shutdownWait) { if (this.shutdownWait) {
this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait); this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
logger.debug(`Shutting down in ${this.shutdownWait}ms if no ping received`); logger.debug(
`Shutting down in ${this.shutdownWait}ms if no ping received`
);
} else { } else {
this.shutdownTimer = 0; this.shutdownTimer = null;
} }
const httpServer = http.createServer((req, res) => this.handleRequest(req, res)); const httpServer = http.createServer((req, res) =>
this.handleRequest(req, res)
);
const port = 9223; const port = 9223;
httpServer.listen(port); httpServer.listen(port);
logger.info(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`); logger.info(
`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`
);
if (!params.headless) { if (!params.headless) {
logger.info("Screencasting with VNC on port 6080"); logger.info("Screencasting with VNC on port 6080");
@ -363,18 +394,26 @@ class InteractiveBrowser {
} }
} }
async saveCookiesFor(url) { async saveCookiesFor(url: string) {
try { try {
if (this.params.cookieDays <= 0) { if (this.params.cookieDays <= 0) {
return; return;
} }
const cookies = await this.browser.getCookies(this.page, url); const cookies = await this.browser.getCookies(this.page);
for (const cookie of cookies) { for (const cookieOrig of cookies) {
cookie.expires = (new Date().getTime() / 1000) + this.params.cookieDays * 86400; // eslint-disable-next-line @typescript-eslint/no-explicit-any
const cookie = cookieOrig as any;
cookie.expires =
new Date().getTime() / 1000 + this.params.cookieDays * 86400;
delete cookie.size; delete cookie.size;
delete cookie.session; delete cookie.session;
if (cookie.sameSite && cookie.sameSite !== "Lax" && cookie.sameSite !== "Strict") { if (
cookie.sameSite &&
cookie.sameSite !== "Lax" &&
cookie.sameSite !== "Strict"
) {
delete cookie.sameSite; delete cookie.sameSite;
} }
if (!cookie.domain && !cookie.path) { if (!cookie.domain && !cookie.path) {
@ -382,64 +421,76 @@ class InteractiveBrowser {
} }
} }
await this.browser.setCookies(this.page, cookies); await this.browser.setCookies(this.page, cookies);
} catch (e) { // eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) {
logger.error("Save Cookie Error: ", e); logger.error("Save Cookie Error: ", e);
} }
} }
addOrigin() { addOrigin() {
const url = this.page.url(); const url = this.page.url();
logger.debug("Adding origin", {url}); logger.debug("Adding origin", { url });
if (url.startsWith("http:") || url.startsWith("https:")) { if (url.startsWith("http:") || url.startsWith("https:")) {
this.originSet.add(new URL(url).origin); this.originSet.add(new URL(url).origin);
} }
} }
async handleRequest(req, res) { async handleRequest(req: IncomingMessage, res: ServerResponse) {
const parsedUrl = new URL(req.url, `http://${req.headers.host}`); const parsedUrl = new URL(req.url || "", `http://${req.headers.host}`);
const pathname = parsedUrl.pathname; const pathname = parsedUrl.pathname;
let targetUrl; let targetUrl;
let origins; let origins;
switch (pathname) { switch (pathname) {
case "/": case "/":
res.writeHead(200, {"Content-Type": "text/html"}); res.writeHead(200, { "Content-Type": "text/html" });
if (this.params.headless) { if (this.params.headless) {
targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${this.targetId}&panel=resources`; targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${this.targetId}&panel=resources`;
} else { } else {
targetUrl = `http://$HOST:9223/vnc/?host=$HOST&port=6080&password=${process.env.VNC_PASS}`; targetUrl = `http://$HOST:9223/vnc/?host=$HOST&port=6080&password=${process.env.VNC_PASS}`;
} }
res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replaceAll("$HOST", parsedUrl.hostname))); res.end(
profileHTML.replace(
"$DEVTOOLS_SRC",
targetUrl.replaceAll("$HOST", parsedUrl.hostname)
)
);
return; return;
case "/vnc/": case "/vnc/":
case "/vnc/index.html": case "/vnc/index.html":
res.writeHead(200, {"Content-Type": "text/html"}); res.writeHead(200, { "Content-Type": "text/html" });
res.end(vncHTML); res.end(vncHTML);
return; return;
case "/ping": case "/ping":
if (this.shutdownWait) { if (this.shutdownWait) {
clearInterval(this.shutdownTimer); // eslint-disable-next-line @typescript-eslint/no-explicit-any
this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait); clearTimeout(this.shutdownTimer as any);
logger.debug(`Ping received, delaying shutdown for ${this.shutdownWait}ms`); this.shutdownTimer = setTimeout(
() => process.exit(0),
this.shutdownWait
);
logger.debug(
`Ping received, delaying shutdown for ${this.shutdownWait}ms`
);
} }
origins = Array.from(this.originSet.values()); origins = Array.from(this.originSet.values());
res.writeHead(200, {"Content-Type": "application/json"}); res.writeHead(200, { "Content-Type": "application/json" });
res.end(JSON.stringify({pong: true, origins})); res.end(JSON.stringify({ pong: true, origins }));
return; return;
case "/target": case "/target":
res.writeHead(200, {"Content-Type": "application/json"}); res.writeHead(200, { "Content-Type": "application/json" });
res.end(JSON.stringify({targetId: this.targetId})); res.end(JSON.stringify({ targetId: this.targetId }));
return; return;
case "/vncpass": case "/vncpass":
res.writeHead(200, {"Content-Type": "application/json"}); res.writeHead(200, { "Content-Type": "application/json" });
res.end(JSON.stringify({password: process.env.VNC_PASS})); res.end(JSON.stringify({ password: process.env.VNC_PASS }));
return; return;
case "/navigate": case "/navigate":
@ -451,14 +502,14 @@ class InteractiveBrowser {
const postData = await this.readBodyJson(req); const postData = await this.readBodyJson(req);
const url = new URL(postData.url).href; const url = new URL(postData.url).href;
res.writeHead(200, {"Content-Type": "application/json"}); res.writeHead(200, { "Content-Type": "application/json" });
res.end(JSON.stringify({success: true})); res.end(JSON.stringify({ success: true }));
this.page.goto(url); this.page.goto(url);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e) { } catch (e: any) {
res.writeHead(400, {"Content-Type": "application/json"}); res.writeHead(400, { "Content-Type": "application/json" });
res.end(JSON.stringify({"error": e.toString()})); res.end(JSON.stringify({ error: e.toString() }));
logger.warn("HTTP Error", e); logger.warn("HTTP Error", e);
} }
return; return;
@ -474,14 +525,21 @@ class InteractiveBrowser {
await this.saveAllCookies(); await this.saveAllCookies();
const resource = await createProfile(this.params, this.browser, this.page, this.cdp, targetFilename); const resource = await createProfile(
this.params,
this.browser,
this.page,
this.cdp,
targetFilename
);
origins = Array.from(this.originSet.values()); origins = Array.from(this.originSet.values());
res.writeHead(200, {"Content-Type": "application/json"}); res.writeHead(200, { "Content-Type": "application/json" });
res.end(JSON.stringify({resource, origins})); res.end(JSON.stringify({ resource, origins }));
} catch (e) { // eslint-disable-next-line @typescript-eslint/no-explicit-any
res.writeHead(500, {"Content-Type": "application/json"}); } catch (e: any) {
res.end(JSON.stringify({"error": e.toString()})); res.writeHead(500, { "Content-Type": "application/json" });
res.end(JSON.stringify({ error: e.toString() }));
logger.warn("HTTP Error", e); logger.warn("HTTP Error", e);
} }
@ -498,11 +556,16 @@ class InteractiveBrowser {
await createProfile(this.params, this.browser, this.page, this.cdp); await createProfile(this.params, this.browser, this.page, this.cdp);
res.writeHead(200, {"Content-Type": "text/html"}); res.writeHead(200, { "Content-Type": "text/html" });
res.end("<html><body>Profile Created! You may now close this window.</body></html>"); res.end(
} catch (e) { "<html><body>Profile Created! You may now close this window.</body></html>"
res.writeHead(500, {"Content-Type": "text/html"}); );
res.end("<html><body>Profile creation failed! See the browsertrix-crawler console for more info"); // eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) {
res.writeHead(500, { "Content-Type": "text/html" });
res.end(
"<html><body>Profile creation failed! See the browsertrix-crawler console for more info"
);
logger.warn("HTTP Error", e); logger.warn("HTTP Error", e);
} }
@ -511,18 +574,21 @@ class InteractiveBrowser {
} }
if (pathname.startsWith("/vnc/")) { if (pathname.startsWith("/vnc/")) {
const fileUrl = new URL("node_modules/@novnc/novnc/" + pathname.slice("/vnc/".length), import.meta.url); const fileUrl = new URL(
const file = fs.readFileSync(fileUrl, {encoding: "utf-8"}); "../node_modules/@novnc/novnc/" + pathname.slice("/vnc/".length),
res.writeHead(200, {"Content-Type": "application/javascript"}); import.meta.url
);
const file = fs.readFileSync(fileUrl, { encoding: "utf-8" });
res.writeHead(200, { "Content-Type": "application/javascript" });
res.end(file); res.end(file);
return; return;
} }
res.writeHead(404, {"Content-Type": "text/html"}); res.writeHead(404, { "Content-Type": "text/html" });
res.end("Not Found"); res.end("Not Found");
} }
async readBodyJson(req) { async readBodyJson(req: IncomingMessage) {
const buffers = []; const buffers = [];
for await (const chunk of req) { for await (const chunk of req) {

7
src/defaultDriver.ts Normal file
View file

@ -0,0 +1,7 @@
import { Page } from "puppeteer-core";
import { PageState } from "./util/state.js";
import { Crawler } from "./crawler.js";
// Default driver entry point: receives one options object and hands its
// `page` and `data` to `crawler.loadPage`, resolving once that call settles.
export default async (
  opts: { data: PageState; page: Page; crawler: Crawler }
): Promise<void> => {
  const { data, page, crawler } = opts;
  await crawler.loadPage(page, data);
};

View file

@ -5,13 +5,13 @@ import { setExitOnRedisError } from "./util/redis.js";
import { Crawler } from "./crawler.js"; import { Crawler } from "./crawler.js";
var crawler = null; let crawler : Crawler | null = null;
var lastSigInt = 0; let lastSigInt = 0;
let forceTerm = false; let forceTerm = false;
async function handleTerminate(signame) { async function handleTerminate(signame: string) {
logger.info(`${signame} received...`); logger.info(`${signame} received...`);
if (!crawler || !crawler.crawlState) { if (!crawler || !crawler.crawlState) {
logger.error("error: no crawler running, exiting"); logger.error("error: no crawler running, exiting");
@ -23,7 +23,7 @@ async function handleTerminate(signame) {
process.exit(0); process.exit(0);
} }
setExitOnRedisError(true); setExitOnRedisError();
try { try {
crawler.checkCanceled(); crawler.checkCanceled();
@ -31,13 +31,13 @@ async function handleTerminate(signame) {
if (!crawler.interrupted) { if (!crawler.interrupted) {
logger.info("SIGNAL: gracefully finishing current pages..."); logger.info("SIGNAL: gracefully finishing current pages...");
crawler.gracefulFinishOnInterrupt(); crawler.gracefulFinishOnInterrupt();
} else if (forceTerm || Date.now() - lastSigInt > 200) {
} else if (forceTerm || (Date.now() - lastSigInt) > 200) {
logger.info("SIGNAL: stopping crawl now..."); logger.info("SIGNAL: stopping crawl now...");
await crawler.serializeAndExit(); await crawler.serializeAndExit();
} }
lastSigInt = Date.now(); lastSigInt = Date.now();
} catch (e) { // eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) {
logger.error("Error stopping crawl after receiving termination signal", e); logger.error("Error stopping crawl after receiving termination signal", e);
} }
} }

View file

@ -4,7 +4,7 @@ import os from "os";
import yaml from "js-yaml"; import yaml from "js-yaml";
import { KnownDevices as devices } from "puppeteer-core"; import { KnownDevices as devices } from "puppeteer-core";
import yargs from "yargs"; import yargs, { Options } from "yargs";
import { hideBin } from "yargs/helpers"; import { hideBin } from "yargs/helpers";
import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES } from "./constants.js"; import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES } from "./constants.js";
@ -16,8 +16,8 @@ import { logger } from "./logger.js";
// ============================================================================ // ============================================================================
class ArgParser { class ArgParser {
get cliOpts() { get cliOpts() : { [key: string]: Options } {
const coerce = array => { const coerce = (array : string[]) => {
return array.flatMap(v => v.split(",")).filter(x => !!x); return array.flatMap(v => v.split(",")).filter(x => !!x);
}; };
@ -305,7 +305,7 @@ class ArgParser {
"warcInfo": { "warcInfo": {
alias: ["warcinfo"], alias: ["warcinfo"],
describe: "Optional fields added to the warcinfo record in combined WARCs", describe: "Optional fields added to the warcinfo record in combined WARCs",
type: "object" //type: "object"
}, },
"redisStoreUrl": { "redisStoreUrl": {
@ -423,7 +423,7 @@ class ArgParser {
"customBehaviors": { "customBehaviors": {
describe: "injects a custom behavior file or set of behavior files in a directory", describe: "injects a custom behavior file or set of behavior files in a directory",
type: ["string"] type: "string"
}, },
"debugAccessRedis": { "debugAccessRedis": {
@ -433,8 +433,8 @@ class ArgParser {
}; };
} }
parseArgs(argv) { parseArgs(argvParams?: string[]) {
argv = argv || process.argv; let argv = argvParams || process.argv;
if (process.env.CRAWL_ARGS) { if (process.env.CRAWL_ARGS) {
argv = argv.concat(this.splitCrawlArgsQuoteSafe(process.env.CRAWL_ARGS)); argv = argv.concat(this.splitCrawlArgsQuoteSafe(process.env.CRAWL_ARGS));
@ -445,11 +445,12 @@ class ArgParser {
const parsed = yargs(hideBin(argv)) const parsed = yargs(hideBin(argv))
.usage("crawler [options]") .usage("crawler [options]")
.option(this.cliOpts) .option(this.cliOpts)
.config("config", "Path to YAML config file", (configPath) => { .config("config", "Path to YAML config file", (configPath : string | number) => {
if (configPath === "/crawls/stdin") { if (configPath === "/crawls/stdin") {
configPath = process.stdin.fd; configPath = process.stdin.fd;
} }
origConfig = yaml.load(fs.readFileSync(configPath, "utf8")); // eslint-disable-next-line @typescript-eslint/no-explicit-any
origConfig = yaml.load(fs.readFileSync(configPath, "utf8")) as any;
return origConfig; return origConfig;
}) })
.check((argv) => this.validateArgs(argv)) .check((argv) => this.validateArgs(argv))
@ -458,13 +459,15 @@ class ArgParser {
return {parsed, origConfig}; return {parsed, origConfig};
} }
splitCrawlArgsQuoteSafe(crawlArgs) { splitCrawlArgsQuoteSafe(crawlArgs: string) : string[] {
// Split process.env.CRAWL_ARGS on spaces but retaining spaces within double quotes // Split process.env.CRAWL_ARGS on spaces but retaining spaces within double quotes
const regex = /"[^"]+"|[^\s]+/g; const regex = /"[^"]+"|[^\s]+/g;
return crawlArgs.match(regex).map(e => e.replace(/"(.+)"/, "$1")); const res = crawlArgs.match(regex);
return res ? res.map(e => e.replace(/"(.+)"/, "$1")) : [];
} }
validateArgs(argv) { // eslint-disable-next-line @typescript-eslint/no-explicit-any
validateArgs(argv: Record<string, any>) {
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname; argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname;
argv.collection = interpolateFilename(argv.collection, argv.crawlId); argv.collection = interpolateFilename(argv.collection, argv.crawlId);
@ -474,15 +477,16 @@ class ArgParser {
} }
// background behaviors to apply // background behaviors to apply
const behaviorOpts = {}; const behaviorOpts : {[key: string]: string | boolean} = {};
argv.behaviors.forEach((x) => behaviorOpts[x] = true); argv.behaviors.forEach((x: string) => behaviorOpts[x] = true);
behaviorOpts.log = BEHAVIOR_LOG_FUNC; behaviorOpts.log = BEHAVIOR_LOG_FUNC;
argv.behaviorOpts = JSON.stringify(behaviorOpts); argv.behaviorOpts = JSON.stringify(behaviorOpts);
argv.text = argv.text || []; argv.text = argv.text || [];
if (argv.mobileDevice) { if (argv.mobileDevice) {
argv.emulateDevice = devices[argv.mobileDevice.replace("-", " ")]; // eslint-disable-next-line @typescript-eslint/no-explicit-any
argv.emulateDevice = (devices as Record<string, any>)[argv.mobileDevice.replace("-", " ")];
if (!argv.emulateDevice) { if (!argv.emulateDevice) {
logger.fatal("Unknown device: " + argv.mobileDevice); logger.fatal("Unknown device: " + argv.mobileDevice);
} }
@ -556,6 +560,6 @@ class ArgParser {
} }
} }
export function parseArgs(argv) { export function parseArgs(argv?: string[]) {
return new ArgParser().parseArgs(argv); return new ArgParser().parseArgs(argv);
} }

View file

@ -1,6 +1,8 @@
import fs from "fs"; import fs from "fs";
import { logger, errJSON } from "./logger.js"; import { logger, errJSON } from "./logger.js";
import { HTTPRequest, Page } from "puppeteer-core";
import { Browser } from "./browser.js";
const RULE_TYPES = ["block", "allowOnly"]; const RULE_TYPES = ["block", "allowOnly"];
@ -14,11 +16,23 @@ const BlockState = {
BLOCK_AD: "advertisement" BLOCK_AD: "advertisement"
}; };
type BlockRuleDecl = {
url?: string;
frameTextMatch?: string;
inFrameUrl?: string;
type?: string;
}
// =========================================================================== // ===========================================================================
class BlockRule class BlockRule
{ {
constructor(data) { type: string;
url: RegExp | null;
frameTextMatch?: RegExp | null;
inFrameUrl?: RegExp | null;
constructor(data: string | BlockRuleDecl) {
if (typeof(data) === "string") { if (typeof(data) === "string") {
this.url = new RegExp(data); this.url = new RegExp(data);
this.type = "block"; this.type = "block";
@ -49,7 +63,12 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
// =========================================================================== // ===========================================================================
export class BlockRules export class BlockRules
{ {
constructor(blockRules, blockPutUrl, blockErrMsg) { rules: BlockRule[];
blockPutUrl: string;
blockErrMsg: string;
blockedUrlSet = new Set();
constructor(blockRules: BlockRuleDecl[], blockPutUrl: string, blockErrMsg: string) {
this.rules = []; this.rules = [];
this.blockPutUrl = blockPutUrl; this.blockPutUrl = blockPutUrl;
this.blockErrMsg = blockErrMsg; this.blockErrMsg = blockErrMsg;
@ -68,8 +87,8 @@ export class BlockRules
} }
} }
async initPage(browser, page) { async initPage(browser: Browser, page: Page) {
const onRequest = async (request) => { const onRequest = async (request: HTTPRequest) => {
const logDetails = {page: page.url()}; const logDetails = {page: page.url()};
try { try {
await this.handleRequest(request, logDetails); await this.handleRequest(request, logDetails);
@ -80,7 +99,8 @@ export class BlockRules
await browser.interceptRequest(page, onRequest); await browser.interceptRequest(page, onRequest);
} }
async handleRequest(request, logDetails) { // eslint-disable-next-line @typescript-eslint/no-explicit-any
async handleRequest(request: HTTPRequest, logDetails: Record<string, any>) {
const url = request.url(); const url = request.url();
let blockState; let blockState;
@ -99,7 +119,8 @@ export class BlockRules
} }
} }
async shouldBlock(request, url, logDetails) { // eslint-disable-next-line @typescript-eslint/no-explicit-any
async shouldBlock(request: HTTPRequest, url: string, logDetails: Record<string, any>) {
if (!url.startsWith("http:") && !url.startsWith("https:")) { if (!url.startsWith("http:") && !url.startsWith("https:")) {
return BlockState.ALLOW; return BlockState.ALLOW;
} }
@ -107,6 +128,9 @@ export class BlockRules
const isNavReq = request.isNavigationRequest(); const isNavReq = request.isNavigationRequest();
const frame = request.frame(); const frame = request.frame();
if (!frame) {
return BlockState.ALLOW;
}
let frameUrl = ""; let frameUrl = "";
let blockState; let blockState;
@ -157,7 +181,8 @@ export class BlockRules
return BlockState.ALLOW; return BlockState.ALLOW;
} }
async ruleCheck(rule, request, reqUrl, frameUrl, isNavReq, logDetails) { // eslint-disable-next-line @typescript-eslint/no-explicit-any
async ruleCheck(rule: BlockRule, request: HTTPRequest, reqUrl: string, frameUrl: string, isNavReq: boolean, logDetails: Record<string, any>) {
const {url, inFrameUrl, frameTextMatch} = rule; const {url, inFrameUrl, frameTextMatch} = rule;
const type = rule.type || "block"; const type = rule.type || "block";
@ -187,7 +212,8 @@ export class BlockRules
return {block, done: false}; return {block, done: false};
} }
async isTextMatch(request, reqUrl, frameTextMatch, logDetails) { // eslint-disable-next-line @typescript-eslint/no-explicit-any
async isTextMatch(request: HTTPRequest, reqUrl: string, frameTextMatch: RegExp, logDetails: Record<string, any>) {
try { try {
const res = await fetch(reqUrl); const res = await fetch(reqUrl);
const text = await res.text(); const text = await res.text();
@ -199,7 +225,7 @@ export class BlockRules
} }
} }
async recordBlockMsg(url) { async recordBlockMsg(url: string) {
if (this.blockedUrlSet.has(url)) { if (this.blockedUrlSet.has(url)) {
return; return;
} }
@ -221,18 +247,21 @@ export class BlockRules
// =========================================================================== // ===========================================================================
export class AdBlockRules extends BlockRules export class AdBlockRules extends BlockRules
{ {
/** Entries parsed from the ad-hosts JSON file; isAdUrl() compares URL host segments against them. */
adhosts: string[];

/**
 * Builds a rule set with no custom block rules and loads the ad-host list.
 *
 * @param blockPutUrl passed through unchanged to the BlockRules constructor
 * @param blockErrMsg passed through unchanged to the BlockRules constructor
 * @param adhostsFilePath path of the ad-hosts JSON file, resolved relative to this module's URL
 */
constructor(blockPutUrl: string, blockErrMsg: string, adhostsFilePath = "../../ad-hosts.json") {
  super([], blockPutUrl, blockErrMsg);
  // Read synchronously as text so JSON.parse receives a string rather than a Buffer.
  const adhostsUrl = new URL(adhostsFilePath, import.meta.url);
  this.adhosts = JSON.parse(fs.readFileSync(adhostsUrl, {"encoding": "utf-8"}));
}
/**
 * Reports whether the host segment of `url` appears in the loaded ad-host list.
 *
 * The host is taken as the third "/"-separated fragment of the URL
 * (e.g. "https://ads.example.com/x" -> "ads.example.com").
 *
 * @param url absolute URL string to test
 * @returns true only when a non-empty host segment exists and is listed in `adhosts`
 */
isAdUrl(url: string) : boolean {
  const fragments = url.split("/");
  const domain = fragments.length > 2 ? fragments[2] : null;
  // Coerce to a strict boolean instead of leaking "" or null to callers.
  return !!domain && this.adhosts.includes(domain);
}
async shouldBlock(request, url, logDetails) { // eslint-disable-next-line @typescript-eslint/no-explicit-any
async shouldBlock(request: HTTPRequest, url: string, logDetails: Record<string, any>) {
if (this.isAdUrl(url)) { if (this.isAdUrl(url)) {
logger.debug("URL blocked for being an ad", {url, ...logDetails}, "blocking"); logger.debug("URL blocked for being an ad", {url, ...logDetails}, "blocking");
await this.recordBlockMsg(url); await this.recordBlockMsg(url);

View file

@ -9,61 +9,85 @@ import path from "path";
import { logger } from "./logger.js"; import { logger } from "./logger.js";
import { initStorage } from "./storage.js"; import { initStorage } from "./storage.js";
import puppeteer from "puppeteer-core"; import puppeteer, { Frame, HTTPRequest, Page, PuppeteerLaunchOptions, Viewport } from "puppeteer-core";
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
/**
 * Options accepted by Browser.launch().
 *
 * `signals` and `headless` are optional because launch() supplies `false`
 * defaults for both when they are omitted.
 */
type LaunchOpts = {
  profileUrl: string;
  // TODO: Fix this the next time the file is edited.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  chromeOptions: Record<string, any>;
  signals?: boolean;
  headless?: boolean;
  // TODO: Fix this the next time the file is edited.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  emulateDevice?: Record<string, any>;
  // TODO: Fix this the next time the file is edited.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  ondisconnect?: ((err: any) => NonNullable<unknown>) | null;
};
// ================================================================== // ==================================================================
export class BaseBrowser export class Browser
{ {
profileDir: string;
customProfile = false;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
emulateDevice: Record<string, any> | null = null;
browser?: PptrBrowser | null = null;
firstCDP: CDPSession | null = null;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
recorders: any[] = [];
/**
 * Creates a fresh temporary directory (prefix "profile-" under the OS temp dir)
 * and stores its path in `profileDir`. All other state is set by the class
 * field initialisers.
 */
constructor() {
  this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
}
/**
 * Launches the browser unless isLaunched() already reports it as running.
 *
 * @param profileUrl when truthy, handed to loadProfile(); its result is stored in `customProfile`
 * @param chromeOptions forwarded to chromeArgs() to build the command-line arguments
 * @param signals value used for handleSIGHUP / handleSIGINT / handleSIGTERM (default false)
 * @param headless when true the browser is started with headless mode "new", otherwise headful (default false)
 * @param emulateDevice stored on the instance as `emulateDevice` (default {})
 * @param ondisconnect forwarded to _init() (default null)
 */
async launch({profileUrl, chromeOptions, signals = false, headless = false, emulateDevice = {}, ondisconnect = null} : LaunchOpts) {
  if (this.isLaunched()) {
    return;
  }

  if (profileUrl) {
    this.customProfile = await this.loadProfile(profileUrl);
  }

  this.emulateDevice = emulateDevice;

  const args = this.chromeArgs(chromeOptions);

  let defaultViewport = null;

  // GEOMETRY is read as "<width>x<height>"; when unset the viewport stays null.
  if (process.env.GEOMETRY) {
    const geom = process.env.GEOMETRY.split("x");

    defaultViewport = {width: Number(geom[0]), height: Number(geom[1])};
  }

  const launchOpts : PuppeteerLaunchOptions = {
    args,
    headless: headless ? "new" : false,
    executablePath: this.getBrowserExe(),
    ignoreDefaultArgs: ["--enable-automation", "--hide-scrollbars"],
    ignoreHTTPSErrors: true,
    handleSIGHUP: signals,
    handleSIGINT: signals,
    handleSIGTERM: signals,
    protocolTimeout: 0,

    defaultViewport,
    waitForInitialPage: false,
    userDataDir: this.profileDir
  };

  await this._init(launchOpts, ondisconnect);
}
async setupPage({page} : {page: Page, cdp: CDPSession}) {
await this.addInitScript(page, "Object.defineProperty(navigator, \"webdriver\", {value: false});"); await this.addInitScript(page, "Object.defineProperty(navigator, \"webdriver\", {value: false});");
if (this.customProfile) { if (this.customProfile) {
@ -73,7 +97,7 @@ export class BaseBrowser
} }
} }
async loadProfile(profileFilename) { async loadProfile(profileFilename: string) : Promise<boolean> {
const targetFilename = "/tmp/profile.tar.gz"; const targetFilename = "/tmp/profile.tar.gz";
if (profileFilename && if (profileFilename &&
@ -83,16 +107,19 @@ export class BaseBrowser
const resp = await fetch(profileFilename); const resp = await fetch(profileFilename);
await pipeline( await pipeline(
Readable.fromWeb(resp.body), // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
Readable.fromWeb(resp.body as any),
fs.createWriteStream(targetFilename) fs.createWriteStream(targetFilename)
); );
profileFilename = targetFilename; profileFilename = targetFilename;
} else if (profileFilename && profileFilename.startsWith("@")) { } else if (profileFilename && profileFilename.startsWith("@")) {
const storage = initStorage(""); const storage = initStorage();
if (!storage) { if (!storage) {
logger.fatal("Profile specified relative to s3 storage, but no S3 storage defined"); logger.fatal("Profile specified relative to s3 storage, but no S3 storage defined");
return false;
} }
await storage.downloadFile(profileFilename.slice(1), targetFilename); await storage.downloadFile(profileFilename.slice(1), targetFilename);
@ -112,7 +139,7 @@ export class BaseBrowser
return false; return false;
} }
/**
 * Archives the contents of `profileDir` by running `tar cvfz <profileFilename> ./`
 * synchronously with `profileDir` as the working directory.
 *
 * @param profileFilename output archive path passed to tar
 */
saveProfile(profileFilename: string) {
  child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: this.profileDir});
}
@ -142,11 +169,17 @@ export class BaseBrowser
} }
getDefaultUA() { getDefaultUA() {
let version = process.env.BROWSER_VERSION; let version : string | undefined = process.env.BROWSER_VERSION;
try { try {
version = child_process.execFileSync(this.getBrowserExe(), ["--version"], {encoding: "utf8"}); const browser = this.getBrowserExe();
version = version.match(/[\d.]+/)[0]; if (browser) {
version = child_process.execFileSync(browser, ["--version"], {encoding: "utf8"});
const match = version && version.match(/[\d.]+/);
if (match) {
version = match[0];
}
}
} catch(e) { } catch(e) {
console.error(e); console.error(e);
} }
@ -161,13 +194,13 @@ export class BaseBrowser
return file; return file;
} }
} }
return null;
} }
async evaluateWithCLI_(cdp, frame, cdpContextId, funcString, logData, contextName) { async evaluateWithCLI_(cdp: CDPSession, frame: Frame, cdpContextId: number, funcString: string, logData: Record<string, string>, contextName: string) {
const frameUrl = frame.url(); const frameUrl = frame.url();
let details = {frameUrl, ...logData}; // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let details : Record<string, any> = {frameUrl, ...logData};
if (!frameUrl || frame.isDetached()) { if (!frameUrl || frame.isDetached()) {
logger.info("Run Script Skipped, frame no longer attached or has no URL", details, contextName); logger.info("Run Script Skipped, frame no longer attached or has no URL", details, contextName);
@ -201,18 +234,6 @@ export class BaseBrowser
return result.value; return result.value;
} }
}
// ==================================================================
export class Browser extends BaseBrowser
{
constructor() {
super();
this.browser = null;
this.firstCDP = null;
}
isLaunched() { isLaunched() {
if (this.browser) { if (this.browser) {
@ -231,11 +252,12 @@ export class Browser extends BaseBrowser
} }
} }
/**
 * Registers `script` on `page` via page.evaluateOnNewDocument().
 *
 * @param page puppeteer page to register the script on
 * @param script JavaScript source text
 * @returns whatever page.evaluateOnNewDocument() returns
 */
addInitScript(page: Page, script: string) {
  return page.evaluateOnNewDocument(script);
}
async _init(launchOpts, ondisconnect = null) { // eslint-disable-next-line @typescript-eslint/ban-types
async _init(launchOpts: PuppeteerLaunchOptions, ondisconnect : Function | null = null) {
this.browser = await puppeteer.launch(launchOpts); this.browser = await puppeteer.launch(launchOpts);
const target = this.browser.target(); const target = this.browser.target();
@ -252,21 +274,29 @@ export class Browser extends BaseBrowser
}); });
} }
async newWindowPageWithCDP() { async newWindowPageWithCDP() : Promise<{cdp: CDPSession, page: Page}> {
// unique url to detect new pages // unique url to detect new pages
const startPage = "about:blank?_browsertrix" + Math.random().toString(36).slice(2); const startPage = "about:blank?_browsertrix" + Math.random().toString(36).slice(2);
const p = new Promise((resolve) => { const p = new Promise<Target>((resolve) => {
const listener = (target) => { const listener = (target: Target) => {
if (target.url() === startPage) { if (target.url() === startPage) {
resolve(target); resolve(target);
this.browser.removeListener("targetcreated", listener); if (this.browser) {
this.browser.removeListener("targetcreated", listener);
}
} }
}; };
this.browser.on("targetcreated", listener); if (this.browser) {
this.browser.on("targetcreated", listener);
}
}); });
if (!this.firstCDP) {
throw new Error("CDP missing");
}
try { try {
await this.firstCDP.send("Target.createTarget", {url: startPage, newWindow: true}); await this.firstCDP.send("Target.createTarget", {url: startPage, newWindow: true});
} catch (e) { } catch (e) {
@ -283,12 +313,17 @@ export class Browser extends BaseBrowser
const target = await p; const target = await p;
const page = await target.page(); const page = await target.page();
if (!page) {
throw new Error("page missing");
}
const device = this.emulateDevice; const device = this.emulateDevice;
if (device) { if (device && page) {
if (device.viewport && device.userAgent) { if (device.viewport && device.userAgent) {
await page.emulate(device); // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
await page.emulate(device as any);
} else if (device.userAgent) { } else if (device.userAgent) {
await page.setUserAgent(device.userAgent); await page.setUserAgent(device.userAgent);
} }
@ -300,9 +335,17 @@ export class Browser extends BaseBrowser
} }
async serviceWorkerFetch() { async serviceWorkerFetch() {
if (!this.firstCDP) {
return;
}
this.firstCDP.on("Fetch.requestPaused", async (params) => { this.firstCDP.on("Fetch.requestPaused", async (params) => {
const { frameId, requestId, networkId, request } = params; const { frameId, requestId, networkId, request } = params;
if (!this.firstCDP) {
throw new Error("CDP missing");
}
if (networkId) { if (networkId) {
try { try {
await this.firstCDP.send("Fetch.continueResponse", {requestId}); await this.firstCDP.send("Fetch.continueResponse", {requestId});
@ -343,30 +386,44 @@ export class Browser extends BaseBrowser
await this.firstCDP.send("Fetch.enable", {patterns: [{urlPattern: "*", requestStage: "Response"}]}); await this.firstCDP.send("Fetch.enable", {patterns: [{urlPattern: "*", requestStage: "Response"}]});
} }
/**
 * Runs `funcString` in `frame` through evaluateWithCLI_(), using the CDP client
 * and context id taken from the frame's execution context.
 *
 * @param _ unused
 * @param frame frame whose execution context is used
 * @param cdp ignored; the client from the frame's execution context is used instead
 * @param funcString forwarded to evaluateWithCLI_()
 * @param logData forwarded to evaluateWithCLI_()
 * @param contextName forwarded to evaluateWithCLI_()
 * @returns the value returned by evaluateWithCLI_()
 */
// TODO: Fix this the next time the file is edited.
async evaluateWithCLI(
  _: unknown,
  frame: Frame,
  cdp: CDPSession,
  funcString: string,
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  logData: Record<string, any>,
  contextName: string
) {
  // executionContext() and the underscore-prefixed members are reached through an `any` cast.
  // TODO: Fix this the next time the file is edited.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const context = await (frame as any).executionContext();
  const contextCdp : CDPSession = context._client;
  const cdpContextId = context._contextId;
  return await this.evaluateWithCLI_(contextCdp, frame, cdpContextId, funcString, logData, contextName);
}
/**
 * Subscribes `callback` to the page's "request" event.
 *
 * @param page puppeteer page to listen on
 * @param callback invoked with each HTTPRequest emitted by the page
 */
interceptRequest(page: Page, callback: (event: HTTPRequest) => void) {
  page.on("request", callback);
}
/**
 * Delegates to page.waitForNetworkIdle().
 *
 * @param page puppeteer page to wait on
 * @param params options forwarded unchanged (optional `timeout`)
 * @returns the awaited result of page.waitForNetworkIdle()
 */
async waitForNetworkIdle(page: Page, params: {timeout?: number}) {
  return await page.waitForNetworkIdle(params);
}
/**
 * Delegates to page.setViewport() and waits for it to complete.
 *
 * @param page puppeteer page to resize
 * @param params viewport forwarded unchanged
 */
async setViewport(page: Page, params: Viewport) {
  await page.setViewport(params);
}
/**
 * Delegates to page.cookies().
 *
 * @param page puppeteer page to read cookies from
 * @returns the awaited result of page.cookies()
 */
async getCookies(page: Page) {
  return await page.cookies();
}
/**
 * Delegates to page.setCookie(), spreading `cookies` as individual arguments.
 *
 * @param page puppeteer page to set cookies on
 * @param cookies iterable of cookie objects; each element becomes one argument
 * @returns the awaited result of page.setCookie()
 */
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
async setCookies(page: Page, cookies: any) {
  return await page.setCookie(...cookies);
}
} }

View file

@ -3,7 +3,7 @@ import path from "path";
const MAX_DEPTH = 2; const MAX_DEPTH = 2;
export function collectAllFileSources(fileOrDir, ext = null, depth = 0) { export function collectAllFileSources(fileOrDir: string, ext?: string, depth = 0) : string[] {
const resolvedPath = path.resolve(fileOrDir); const resolvedPath = path.resolve(fileOrDir);
if (depth >= MAX_DEPTH) { if (depth >= MAX_DEPTH) {
@ -13,14 +13,14 @@ export function collectAllFileSources(fileOrDir, ext = null, depth = 0) {
const stat = fs.statSync(resolvedPath); const stat = fs.statSync(resolvedPath);
if (stat.isFile && (ext === null || path.extname(resolvedPath) === ext)) { if (stat.isFile() && (ext === null || path.extname(resolvedPath) === ext)) {
const contents = fs.readFileSync(resolvedPath); const contents = fs.readFileSync(resolvedPath);
return [`/* src: ${resolvedPath} */\n\n${contents}`]; return [`/* src: ${resolvedPath} */\n\n${contents}`];
} }
if (stat.isDirectory) { if (stat.isDirectory()) {
const files = fs.readdirSync(resolvedPath); const files = fs.readdirSync(resolvedPath);
return files.reduce((acc, next) => { return files.reduce((acc: string[], next: string) => {
const nextPath = path.join(fileOrDir, next); const nextPath = path.join(fileOrDir, next);
return [...acc, ...collectAllFileSources(nextPath, ext, depth + 1)]; return [...acc, ...collectAllFileSources(nextPath, ext, depth + 1)];
}, []); }, []);
@ -28,6 +28,7 @@ export function collectAllFileSources(fileOrDir, ext = null, depth = 0) {
if (depth === 0) { if (depth === 0) {
console.warn(`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`); console.warn(`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`);
return [];
} }
return [];
} }

View file

@ -6,9 +6,14 @@ import { logger } from "./logger.js";
// =========================================================================== // ===========================================================================
export class HealthChecker export class HealthChecker
{ {
constructor(port, errorThreshold) { port: number;
errorThreshold: number;
healthServer: http.Server;
errorCount = 0;
constructor(port: number, errorThreshold: number) {
this.port = port; this.port = port;
this.errorCount = 0;
this.errorThreshold = errorThreshold; this.errorThreshold = errorThreshold;
this.healthServer = http.createServer((...args) => this.healthCheck(...args)); this.healthServer = http.createServer((...args) => this.healthCheck(...args));
@ -16,8 +21,8 @@ export class HealthChecker
this.healthServer.listen(port); this.healthServer.listen(port);
} }
async healthCheck(req, res) { async healthCheck(req: http.IncomingMessage, res: http.ServerResponse) {
const pathname = url.parse(req.url).pathname; const pathname = req.url ? url.parse(req.url).pathname : "";
switch (pathname) { switch (pathname) {
case "/healthz": case "/healthz":
if (this.errorCount < this.errorThreshold) { if (this.errorCount < this.errorThreshold) {

View file

@ -1,57 +1,73 @@
// =========================================================================== // ===========================================================================
// to fix serialization of regexes for logging purposes // to fix serialization of regexes for logging purposes
RegExp.prototype.toJSON = RegExp.prototype.toString;
import { Writable } from "node:stream";
import { RedisCrawlState } from "./state.js";
// RegExp.prototype.toJSON = RegExp.prototype.toString;
Object.defineProperty(RegExp.prototype, "toJSON", { value: RegExp.prototype.toString });
// =========================================================================== // ===========================================================================
/**
 * Converts a caught value into a plain object suitable for JSON logging.
 *
 * @param e any thrown or caught value
 * @returns `{type: "exception", message, stack}` for Error instances,
 *          otherwise `{message}` holding the value's string form
 */
export function errJSON(e: unknown) {
  if (e instanceof Error) {
    return {"type": "exception", "message": e.message, "stack": e.stack};
  }
  // String() also handles null and undefined, where calling e.toString() would throw.
  return {"message": String(e)};
}
// =========================================================================== // ===========================================================================
class Logger class Logger
{ {
// Logger state, initialised through field initialisers.
logStream : Writable | null = null;
debugLogging = false;
logErrorsToRedis = false;
logLevels : string[] = [];
contexts : string[] = [];
crawlState? : RedisCrawlState | null = null;
fatalExitCode = 17;

/**
 * Replaces the stored `fatalExitCode` (17 until changed).
 *
 * @param exitCode new value for `fatalExitCode`
 */
setDefaultFatalExitCode(exitCode: number) {
  this.fatalExitCode = exitCode;
}
/**
 * Stores the writable stream kept in `logStream`.
 *
 * @param logFH stream to store, or null to clear it
 */
setExternalLogStream(logFH: Writable | null) {
  this.logStream = logFH;
}
/**
 * Sets the `debugLogging` flag, which debug() checks before logging.
 *
 * @param debugLog new value of the flag
 */
setDebugLogging(debugLog: boolean) {
  this.debugLogging = debugLog;
}
/**
 * Sets the `logErrorsToRedis` flag.
 *
 * @param logErrorsToRedis new value of the flag
 */
setLogErrorsToRedis(logErrorsToRedis: boolean) {
  this.logErrorsToRedis = logErrorsToRedis;
}
/**
 * Replaces the stored `logLevels` list.
 *
 * @param logLevels new list of log level names
 */
setLogLevel(logLevels: string[]) {
  this.logLevels = logLevels;
}
/**
 * Replaces the stored `contexts` list.
 *
 * @param contexts new list of context names
 */
setContext(contexts: string[]) {
  this.contexts = contexts;
}
/**
 * Stores the crawl state object kept in `crawlState`.
 *
 * @param crawlState state object to store
 */
setCrawlState(crawlState: RedisCrawlState) {
  this.crawlState = crawlState;
}
logAsJSON(message, data, context, logLevel="info") { // TODO: Fix this the next time the file is edited.
logAsJSON(
message: string,
// eslint-disable-next-line @typescript-eslint/no-explicit-any
data: Record<string, string> | Error | any,
context: string,
logLevel="info"
) {
if (data instanceof Error) { if (data instanceof Error) {
data = errJSON(data); data = errJSON(data);
} else if (typeof data !== "object") { } else if (typeof data !== "object") {
@ -70,7 +86,7 @@ class Logger
} }
} }
let dataToLog = { const dataToLog = {
"timestamp": new Date().toISOString(), "timestamp": new Date().toISOString(),
"logLevel": logLevel, "logLevel": logLevel,
"context": context, "context": context,
@ -84,30 +100,30 @@ class Logger
} }
const toLogToRedis = ["error", "fatal"]; const toLogToRedis = ["error", "fatal"];
if (this.logErrorsToRedis && toLogToRedis.includes(logLevel)) { if (this.logErrorsToRedis && this.crawlState && toLogToRedis.includes(logLevel)) {
this.crawlState.logError(string); this.crawlState.logError(string);
} }
} }
/**
 * Logs `message` through logAsJSON() using its default log level.
 *
 * @param message log message
 * @param data extra data to attach (default {})
 * @param context log context name (default "general")
 */
info(message: string, data={}, context="general") {
  this.logAsJSON(message, data, context);
}
/**
 * Logs `message` through logAsJSON() at level "error".
 *
 * @param message log message
 * @param data extra data to attach (default {})
 * @param context log context name (default "general")
 */
error(message: string, data={}, context="general") {
  this.logAsJSON(message, data, context, "error");
}
/**
 * Logs `message` through logAsJSON() at level "warn".
 *
 * @param message log message
 * @param data extra data to attach (default {})
 * @param context log context name (default "general")
 */
warn(message: string, data={}, context="general") {
  this.logAsJSON(message, data, context, "warn");
}
/**
 * Logs `message` through logAsJSON() at level "debug", but only while
 * `debugLogging` is enabled; otherwise does nothing.
 *
 * @param message log message
 * @param data extra data to attach (default {})
 * @param context log context name (default "general")
 */
debug(message: string, data={}, context="general") {
  if (!this.debugLogging) {
    return;
  }
  this.logAsJSON(message, data, context, "debug");
}
fatal(message, data={}, context="general", exitCode=0) { fatal(message: string, data={}, context="general", exitCode=0) {
exitCode = exitCode || this.fatalExitCode; exitCode = exitCode || this.fatalExitCode;
this.logAsJSON(`${message}. Quitting`, data, context, "fatal"); this.logAsJSON(`${message}. Quitting`, data, context, "fatal");

View file

@ -1,10 +1,14 @@
import { HTTPRequest, Page } from "puppeteer-core";
import { errJSON, logger } from "./logger.js"; import { errJSON, logger } from "./logger.js";
import { Browser } from "./browser.js";
export class OriginOverride export class OriginOverride
{ {
constructor(originOverride) { originOverride: {origUrl: URL, destUrl: URL}[];
constructor(originOverride: string[]) {
this.originOverride = originOverride.map((override) => { this.originOverride = originOverride.map((override) => {
let [orig, dest] = override.split("="); const [orig, dest] = override.split("=");
const origUrl = new URL(orig); const origUrl = new URL(orig);
const destUrl = new URL(dest); const destUrl = new URL(dest);
@ -12,8 +16,8 @@ export class OriginOverride
}); });
} }
async initPage(browser, page) { async initPage(browser: Browser, page: Page) {
const onRequest = async (request) => { const onRequest = async (request: HTTPRequest) => {
try { try {
const url = request.url(); const url = request.url();
@ -28,12 +32,13 @@ export class OriginOverride
} }
} }
if (!newUrl) { if (!newUrl || !orig) {
request.continue({}, -1); request.continue({}, -1);
return; return;
} }
const headers = new Headers(request.headers()); const headers = new Headers(request.headers());
headers.set("host", orig.host); headers.set("host", orig.host);
if (headers.get("origin")) { if (headers.get("origin")) {
headers.set("origin", orig.origin); headers.set("origin", orig.origin);

View file

@ -10,12 +10,16 @@ import { logger, errJSON } from "./logger.js";
import { sleep, timestampNow } from "./timing.js"; import { sleep, timestampNow } from "./timing.js";
import { RequestResponseInfo } from "./reqresp.js"; import { RequestResponseInfo } from "./reqresp.js";
// @ts-expect-error TODO fill in why error is expected
import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js"; import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js";
// @ts-expect-error TODO fill in why error is expected
import { rewriteDASH, rewriteHLS } from "@webrecorder/wabac/src/rewrite/rewriteVideo.js"; import { rewriteDASH, rewriteHLS } from "@webrecorder/wabac/src/rewrite/rewriteVideo.js";
import { WARCRecord } from "warcio"; import { WARCRecord } from "warcio";
import { WARCSerializer } from "warcio/node"; import { TempFileBuffer, WARCSerializer } from "warcio/node";
import { WARCWriter } from "./warcwriter.js"; import { WARCWriter } from "./warcwriter.js";
import { RedisCrawlState, WorkerId } from "./state.js";
import { CDPSession, Protocol } from "puppeteer-core";
const MAX_BROWSER_FETCH_SIZE = 2_000_000; const MAX_BROWSER_FETCH_SIZE = 2_000_000;
const MAX_NETWORK_LOAD_SIZE = 200_000_000; const MAX_NETWORK_LOAD_SIZE = 200_000_000;
@ -26,15 +30,58 @@ const WRITE_DUPE_KEY = "s:writedupe";
const encoder = new TextEncoder(); const encoder = new TextEncoder();
// ================================================================= // =================================================================
/**
 * Hook for network-level log messages. Currently a no-op: the logging call
 * in the body is commented out, so both arguments are ignored.
 *
 * @param msg log message
 * @param data extra data for the message
 */
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unused-vars
function logNetwork(msg: string, data: any) {
  // logger.debug(msg, data, "recorderNetwork");
}
// ================================================================= // =================================================================
export class Recorder export class Recorder
{ {
constructor({workerid, collDir, crawler}) { workerid: WorkerId;
collDir: string;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
crawler: any;
crawlState: RedisCrawlState;
warcQ: PQueue;
fetcherQ: PQueue;
pendingRequests!: Map<string, RequestResponseInfo>;
skipIds!: Set<string>;
swSessionId?: string | null;
swFrameIds = new Set<string>();
swUrls = new Set<string>();
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
logDetails: Record<string, any> = {};
skipping = false;
allowFull206 = false;
archivesDir: string;
tempdir: string;
tempCdxDir: string;
gzip = true;
writer: WARCWriter;
pageid!: string;
// TODO: Fix this the next time the file is edited.
constructor(
// eslint-disable-next-line @typescript-eslint/no-explicit-any
{workerid, collDir, crawler} : {workerid: WorkerId, collDir: string, crawler: any}
) {
this.workerid = workerid; this.workerid = workerid;
this.crawler = crawler; this.crawler = crawler;
this.crawlState = crawler.crawlState; this.crawlState = crawler.crawlState;
@ -43,19 +90,6 @@ export class Recorder
this.fetcherQ = new PQueue({concurrency: 1}); this.fetcherQ = new PQueue({concurrency: 1});
this.pendingRequests = null;
this.skipIds = null;
this.swSessionId = null;
this.swFrameIds = new Set();
this.swUrls = new Set();
this.logDetails = {};
this.skipping = false;
this.allowFull206 = true;
this.collDir = collDir; this.collDir = collDir;
this.archivesDir = path.join(this.collDir, "archive"); this.archivesDir = path.join(this.collDir, "archive");
@ -68,7 +102,6 @@ export class Recorder
const crawlId = process.env.CRAWL_ID || os.hostname(); const crawlId = process.env.CRAWL_ID || os.hostname();
const filename = `rec-${crawlId}-${timestampNow()}-${this.workerid}.warc`; const filename = `rec-${crawlId}-${timestampNow()}-${this.workerid}.warc`;
this.gzip = true;
this.writer = new WARCWriter({ this.writer = new WARCWriter({
archivesDir: this.archivesDir, archivesDir: this.archivesDir,
@ -79,7 +112,7 @@ export class Recorder
}); });
} }
async onCreatePage({cdp}) { async onCreatePage({cdp} : {cdp: CDPSession}) {
// Fetch // Fetch
cdp.on("Fetch.requestPaused", async (params) => { cdp.on("Fetch.requestPaused", async (params) => {
@ -159,7 +192,7 @@ export class Recorder
await cdp.send("Target.setAutoAttach", {autoAttach: true, waitForDebuggerOnStart: false, flatten: true}); await cdp.send("Target.setAutoAttach", {autoAttach: true, waitForDebuggerOnStart: false, flatten: true});
} }
handleResponseReceived(params) { handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) {
const { requestId, response } = params; const { requestId, response } = params;
const reqresp = this.pendingReqResp(requestId); const reqresp = this.pendingReqResp(requestId);
@ -170,7 +203,7 @@ export class Recorder
reqresp.fillResponse(response); reqresp.fillResponse(response);
} }
handleRequestExtraInfo(params) { handleRequestExtraInfo(params: Protocol.Network.RequestWillBeSentExtraInfoEvent) {
if (!this.shouldSkip(params.headers)) { if (!this.shouldSkip(params.headers)) {
const reqresp = this.pendingReqResp(params.requestId, true); const reqresp = this.pendingReqResp(params.requestId, true);
if (reqresp) { if (reqresp) {
@ -179,13 +212,13 @@ export class Recorder
} }
} }
handleRedirectResponse(params) { handleRedirectResponse(params: Protocol.Network.RequestWillBeSentEvent) {
const { requestId, redirectResponse } = params; const { requestId, redirectResponse } = params;
// remove and serialize, but allow reusing requestId // remove and serialize, but allow reusing requestId
// as redirect chain may reuse same requestId for subsequent request // as redirect chain may reuse same requestId for subsequent request
const reqresp = this.removeReqResp(requestId, true); const reqresp = this.removeReqResp(requestId, true);
if (!reqresp) { if (!reqresp || !redirectResponse) {
return; return;
} }
@ -199,7 +232,7 @@ export class Recorder
this.serializeToWARC(reqresp); this.serializeToWARC(reqresp);
} }
handleLoadingFailed(params) { handleLoadingFailed(params: Protocol.Network.LoadingFailedEvent) {
const { errorText, type, requestId } = params; const { errorText, type, requestId } = params;
const reqresp = this.pendingReqResp(requestId, true); const reqresp = this.pendingReqResp(requestId, true);
@ -211,13 +244,13 @@ export class Recorder
switch (errorText) { switch (errorText) {
case "net::ERR_BLOCKED_BY_CLIENT": case "net::ERR_BLOCKED_BY_CLIENT":
logNetwork("Request blocked", {url, errorText, ...this.logDetails}, "recorder"); logNetwork("Request blocked", {url, errorText, ...this.logDetails});
break; break;
case "net::ERR_ABORTED": case "net::ERR_ABORTED":
// check if this is a false positive -- a valid download that's already been fetched // check if this is a false positive -- a valid download that's already been fetched
// the abort is just for page, but download will succeed // the abort is just for page, but download will succeed
if (url && type === "Document" && reqresp.isValidBinary()) { if (type === "Document" && reqresp.isValidBinary()) {
this.serializeToWARC(reqresp); this.serializeToWARC(reqresp);
//} else if (url) { //} else if (url) {
} else if (url && reqresp.requestHeaders && reqresp.requestHeaders["x-browsertrix-fetch"]) { } else if (url && reqresp.requestHeaders && reqresp.requestHeaders["x-browsertrix-fetch"]) {
@ -235,7 +268,7 @@ export class Recorder
this.removeReqResp(requestId); this.removeReqResp(requestId);
} }
handleLoadingFinished(params) { handleLoadingFinished(params: Protocol.Network.LoadingFinishedEvent) {
const reqresp = this.pendingReqResp(params.requestId, true); const reqresp = this.pendingReqResp(params.requestId, true);
if (!reqresp || reqresp.asyncLoading) { if (!reqresp || reqresp.asyncLoading) {
@ -251,7 +284,7 @@ export class Recorder
this.serializeToWARC(reqresp); this.serializeToWARC(reqresp);
} }
async handleRequestPaused(params, cdp, isSWorker = false) { async handleRequestPaused(params: Protocol.Fetch.RequestPausedEvent, cdp: CDPSession, isSWorker = false) {
const { requestId, request, responseStatusCode, responseErrorReason, resourceType, networkId } = params; const { requestId, request, responseStatusCode, responseErrorReason, resourceType, networkId } = params;
const { method, headers, url } = request; const { method, headers, url } = request;
@ -276,7 +309,7 @@ export class Recorder
} }
} }
async handleFetchResponse(params, cdp, isSWorker) { async handleFetchResponse(params: Protocol.Fetch.RequestPausedEvent, cdp: CDPSession, isSWorker: boolean) {
const { request } = params; const { request } = params;
const { url } = request; const { url } = request;
const {requestId, responseErrorReason, responseStatusCode, responseHeaders} = params; const {requestId, responseErrorReason, responseStatusCode, responseHeaders} = params;
@ -341,7 +374,7 @@ export class Recorder
// if not consumed via takeStream, attempt async loading // if not consumed via takeStream, attempt async loading
if (!streamingConsume) { if (!streamingConsume) {
let fetcher = null; let fetcher : AsyncFetcher;
if (reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE) { if (reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE) {
fetcher = new AsyncFetcher(opts); fetcher = new AsyncFetcher(opts);
@ -388,12 +421,12 @@ export class Recorder
try { try {
await cdp.send("Fetch.fulfillRequest", { await cdp.send("Fetch.fulfillRequest", {
requestId, requestId,
responseCode: responseStatusCode, responseCode: responseStatusCode || 0,
responseHeaders, responseHeaders,
body body
}); });
} catch (e) { } catch (e) {
const type = reqresp.type; const type = reqresp.resourceType;
if (type === "Document") { if (type === "Document") {
logger.debug("document not loaded in browser, possibly other URLs missing", {url, type: reqresp.resourceType}, "recorder"); logger.debug("document not loaded in browser, possibly other URLs missing", {url, type: reqresp.resourceType}, "recorder");
} else { } else {
@ -404,7 +437,7 @@ export class Recorder
return true; return true;
} }
startPage({pageid, url}) { startPage({pageid, url} : {pageid: string, url: string}) {
this.pageid = pageid; this.pageid = pageid;
this.logDetails = {page: url, workerid: this.workerid}; this.logDetails = {page: url, workerid: this.workerid};
if (this.pendingRequests && this.pendingRequests.size) { if (this.pendingRequests && this.pendingRequests.size) {
@ -431,8 +464,8 @@ export class Recorder
while (numPending && !this.crawler.interrupted) { while (numPending && !this.crawler.interrupted) {
const pending = []; const pending = [];
for (const [requestId, reqresp] of this.pendingRequests.entries()) { for (const [requestId, reqresp] of this.pendingRequests.entries()) {
const url = reqresp.url; const url = reqresp.url || "";
const entry = {requestId, url}; const entry : {requestId: string, url: string, expectedSize?: number, readSize?: number} = {requestId, url};
if (reqresp.expectedSize) { if (reqresp.expectedSize) {
entry.expectedSize = reqresp.expectedSize; entry.expectedSize = reqresp.expectedSize;
} }
@ -464,7 +497,7 @@ export class Recorder
await this.writer.flush(); await this.writer.flush();
} }
shouldSkip(headers, url, method, resourceType) { shouldSkip(headers: Protocol.Network.Headers, url?: string, method?: string, resourceType?: string) {
if (headers && !method) { if (headers && !method) {
method = headers[":method"]; method = headers[":method"];
} }
@ -477,7 +510,7 @@ export class Recorder
return true; return true;
} }
if (["EventSource", "WebSocket", "Ping"].includes(resourceType)) { if (["EventSource", "WebSocket", "Ping"].includes(resourceType || "")) {
return true; return true;
} }
@ -494,7 +527,7 @@ export class Recorder
return false; return false;
} }
async rewriteResponse(reqresp) { async rewriteResponse(reqresp: RequestResponseInfo) {
const { url, responseHeadersList, extraOpts, payload } = reqresp; const { url, responseHeadersList, extraOpts, payload } = reqresp;
if (!payload || !payload.length) { if (!payload || !payload.length) {
@ -509,12 +542,12 @@ export class Recorder
switch (ct) { switch (ct) {
case "application/x-mpegURL": case "application/x-mpegURL":
case "application/vnd.apple.mpegurl": case "application/vnd.apple.mpegurl":
string = payload.toString("utf-8"); string = payload.toString();
newString = rewriteHLS(string, {save: extraOpts}); newString = rewriteHLS(string, {save: extraOpts});
break; break;
case "application/dash+xml": case "application/dash+xml":
string = payload.toString("utf-8"); string = payload.toString();
newString = rewriteDASH(string, {save: extraOpts}); newString = rewriteDASH(string, {save: extraOpts});
break; break;
@ -526,7 +559,7 @@ export class Recorder
const rw = baseDSRules.getRewriter(url); const rw = baseDSRules.getRewriter(url);
if (rw !== baseDSRules.defaultRewriter) { if (rw !== baseDSRules.defaultRewriter) {
string = payload.toString("utf-8"); string = payload.toString();
newString = rw.rewrite(string, {live: true, save: extraOpts}); newString = rw.rewrite(string, {live: true, save: extraOpts});
} }
break; break;
@ -549,8 +582,11 @@ export class Recorder
//return Buffer.from(newString).toString("base64"); //return Buffer.from(newString).toString("base64");
} }
_getContentType(headers) { _getContentType(headers? : Protocol.Fetch.HeaderEntry[] | {name: string, value: string}[]) {
for (let header of headers) { if (!headers) {
return null;
}
for (const header of headers) {
if (header.name.toLowerCase() === "content-type") { if (header.name.toLowerCase() === "content-type") {
return header.value.split(";")[0]; return header.value.split(";")[0];
} }
@ -559,8 +595,11 @@ export class Recorder
return null; return null;
} }
_getContentLen(headers) { _getContentLen(headers? : Protocol.Fetch.HeaderEntry[]) {
for (let header of headers) { if (!headers) {
return -1;
}
for (const header of headers) {
if (header.name.toLowerCase() === "content-length") { if (header.name.toLowerCase() === "content-length") {
return Number(header.value); return Number(header.value);
} }
@ -569,8 +608,11 @@ export class Recorder
return -1; return -1;
} }
_getContentRange(headers) { _getContentRange(headers? : Protocol.Fetch.HeaderEntry[]) {
for (let header of headers) { if (!headers) {
return null;
}
for (const header of headers) {
if (header.name.toLowerCase() === "content-range") { if (header.name.toLowerCase() === "content-range") {
return header.value; return header.value;
} }
@ -579,15 +621,15 @@ export class Recorder
return null; return null;
} }
noResponseForStatus(status) { noResponseForStatus(status: number | undefined | null) {
return (!status || status === 204 || (status >= 300 && status < 400)); return (!status || status === 204 || (status >= 300 && status < 400));
} }
isValidUrl(url) { isValidUrl(url?: string) {
return url && (url.startsWith("https:") || url.startsWith("http:")); return url && (url.startsWith("https:") || url.startsWith("http:"));
} }
pendingReqResp(requestId, reuseOnly = false) { pendingReqResp(requestId: string, reuseOnly = false) {
if (!this.pendingRequests.has(requestId)) { if (!this.pendingRequests.has(requestId)) {
if (reuseOnly || !requestId) { if (reuseOnly || !requestId) {
return null; return null;
@ -605,14 +647,14 @@ export class Recorder
return reqresp; return reqresp;
} else { } else {
const reqresp = this.pendingRequests.get(requestId); const reqresp = this.pendingRequests.get(requestId);
if (requestId !== reqresp.requestId) { if (reqresp && requestId !== reqresp.requestId) {
logger.warn("Invalid request id", {requestId, actualRequestId: reqresp.requestId}, "recorder"); logger.warn("Invalid request id", {requestId, actualRequestId: reqresp.requestId}, "recorder");
} }
return reqresp; return reqresp;
} }
} }
removeReqResp(requestId, allowReuse=false) { removeReqResp(requestId: string, allowReuse=false) {
const reqresp = this.pendingRequests.get(requestId); const reqresp = this.pendingRequests.get(requestId);
this.pendingRequests.delete(requestId); this.pendingRequests.delete(requestId);
if (!allowReuse) { if (!allowReuse) {
@ -621,13 +663,13 @@ export class Recorder
return reqresp; return reqresp;
} }
async serializeToWARC(reqresp) { async serializeToWARC(reqresp: RequestResponseInfo) {
if (!reqresp.payload) { if (!reqresp.payload) {
logNetwork("Not writing, no payload", {url: reqresp.url}); logNetwork("Not writing, no payload", {url: reqresp.url});
return; return;
} }
if (reqresp.method === "GET" && !await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, reqresp.url)) { if (reqresp.url && reqresp.method === "GET" && !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, reqresp.url))) {
logNetwork("Skipping dupe", {url: reqresp.url}); logNetwork("Skipping dupe", {url: reqresp.url});
return; return;
} }
@ -638,21 +680,21 @@ export class Recorder
this.warcQ.add(() => this.writer.writeRecordPair(responseRecord, requestRecord)); this.warcQ.add(() => this.writer.writeRecordPair(responseRecord, requestRecord));
} }
async directFetchCapture(url) { async directFetchCapture(url: string) : Promise<{fetched: boolean, mime: string}>{
const reqresp = new RequestResponseInfo(0); const reqresp = new RequestResponseInfo("0");
reqresp.url = url; reqresp.url = url;
reqresp.method = "GET"; reqresp.method = "GET";
logger.debug("Directly fetching page URL without browser", {url, ...this.logDetails}, "recorder"); logger.debug("Directly fetching page URL without browser", {url, ...this.logDetails}, "recorder");
const filter = (resp) => resp.status === 200 && !resp.headers.get("set-cookie"); const filter = (resp: Response) => resp.status === 200 && !resp.headers.get("set-cookie");
// ignore dupes: if previous URL was not a page, still load as page. if previous was page, // ignore dupes: if previous URL was not a page, still load as page. if previous was page,
// should not get here, as dupe pages tracked via seen list // should not get here, as dupe pages tracked via seen list
const fetcher = new AsyncFetcher({tempdir: this.tempdir, reqresp, recorder: this, networkId: 0, filter, ignoreDupe: true}); const fetcher = new AsyncFetcher({tempdir: this.tempdir, reqresp, recorder: this, networkId: "0", filter, ignoreDupe: true});
const res = await fetcher.load(); const res = await fetcher.load();
const mime = reqresp && reqresp.responseHeaders["content-type"] && reqresp.responseHeaders["content-type"].split(";")[0]; const mime = reqresp && reqresp.responseHeaders && reqresp.responseHeaders["content-type"] && reqresp.responseHeaders["content-type"].split(";")[0] || "";
return {fetched: res === "fetched", mime}; return {fetched: res === "fetched", mime};
} }
@ -661,7 +703,20 @@ export class Recorder
// ================================================================= // =================================================================
class AsyncFetcher class AsyncFetcher
{ {
constructor({tempdir, reqresp, expectedSize = -1, recorder, networkId, filter = null, ignoreDupe = false}) { reqresp: RequestResponseInfo;
networkId: string;
filter?: (resp: Response) => boolean;
ignoreDupe = false;
recorder: Recorder;
tempdir: string;
filename: string;
constructor({tempdir, reqresp, expectedSize = -1, recorder, networkId, filter = undefined, ignoreDupe = false} :
{tempdir: string, reqresp: RequestResponseInfo, expectedSize?: number, recorder: Recorder,
networkId: string, filter?: (resp: Response) => boolean, ignoreDupe?: boolean }) {
this.reqresp = reqresp; this.reqresp = reqresp;
this.reqresp.expectedSize = expectedSize; this.reqresp.expectedSize = expectedSize;
this.reqresp.asyncLoading = true; this.reqresp.asyncLoading = true;
@ -685,7 +740,7 @@ class AsyncFetcher
let fetched = "notfetched"; let fetched = "notfetched";
try { try {
if (reqresp.method === "GET" && !await crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url)) { if (reqresp.method === "GET" && url && !(await crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url))) {
if (!this.ignoreDupe) { if (!this.ignoreDupe) {
this.reqresp.asyncLoading = false; this.reqresp.asyncLoading = false;
return "dupe"; return "dupe";
@ -719,7 +774,7 @@ class AsyncFetcher
//return fetched; //return fetched;
} }
const externalBuffer = serializer.externalBuffer; const externalBuffer : TempFileBuffer = serializer.externalBuffer as TempFileBuffer;
if (externalBuffer) { if (externalBuffer) {
const { currSize, buffers, fh } = externalBuffer; const { currSize, buffers, fh } = externalBuffer;
@ -731,14 +786,14 @@ class AsyncFetcher
} }
if (Object.keys(reqresp.extraOpts).length) { if (Object.keys(reqresp.extraOpts).length) {
responseRecord.warcHeaders["WARC-JSON-Metadata"] = JSON.stringify(reqresp.extraOpts); responseRecord.warcHeaders.headers.set("WARC-JSON-Metadata", JSON.stringify(reqresp.extraOpts));
} }
recorder.warcQ.add(() => recorder.writer.writeRecordPair(responseRecord, requestRecord, serializer)); recorder.warcQ.add(() => recorder.writer.writeRecordPair(responseRecord, requestRecord, serializer));
} catch (e) { } catch (e) {
logger.error("Streaming Fetch Error", {url, networkId, filename, ...errJSON(e), ...logDetails}, "recorder"); logger.error("Streaming Fetch Error", {url, networkId, filename, ...errJSON(e), ...logDetails}, "recorder");
await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url); await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url!);
} finally { } finally {
recorder.removeReqResp(networkId); recorder.removeReqResp(networkId);
} }
@ -761,9 +816,9 @@ class AsyncFetcher
signal = abort.signal; signal = abort.signal;
} }
const resp = await fetch(url, {method, headers, body: reqresp.postData || undefined, signal}); const resp = await fetch(url!, {method, headers, body: reqresp.postData || undefined, signal});
if (this.filter && !this.filter(resp)) { if (this.filter && !this.filter(resp) && abort) {
abort.abort(); abort.abort();
throw new Error("invalid response, ignoring fetch"); throw new Error("invalid response, ignoring fetch");
} }
@ -778,7 +833,7 @@ class AsyncFetcher
} else if (!resp.body) { } else if (!resp.body) {
logger.error("Empty body, stopping fetch", {url}, "recorder"); logger.error("Empty body, stopping fetch", {url}, "recorder");
await this.recorder.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url); await this.recorder.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url!);
return; return;
} }
@ -787,7 +842,7 @@ class AsyncFetcher
return this.takeReader(resp.body.getReader()); return this.takeReader(resp.body.getReader());
} }
async* takeReader(reader) { async* takeReader(reader: ReadableStreamDefaultReader<Uint8Array>) {
try { try {
while (true) { while (true) {
const { value, done } = await reader.read(); const { value, done } = await reader.read();
@ -803,7 +858,7 @@ class AsyncFetcher
} }
} }
async* takeStreamIter(cdp, stream) { async* takeStreamIter(cdp: CDPSession, stream: Protocol.IO.StreamHandle) {
try { try {
while (true) { while (true) {
const {data, base64Encoded, eof} = await cdp.send("IO.read", {handle: stream}); const {data, base64Encoded, eof} = await cdp.send("IO.read", {handle: stream});
@ -825,7 +880,12 @@ class AsyncFetcher
// ================================================================= // =================================================================
class ResponseStreamAsyncFetcher extends AsyncFetcher class ResponseStreamAsyncFetcher extends AsyncFetcher
{ {
constructor(opts) { cdp: CDPSession;
requestId: string;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
constructor(opts: any) {
super(opts); super(opts);
this.cdp = opts.cdp; this.cdp = opts.cdp;
this.requestId = opts.requestId; this.requestId = opts.requestId;
@ -845,7 +905,11 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher
// ================================================================= // =================================================================
class NetworkLoadStreamAsyncFetcher extends AsyncFetcher class NetworkLoadStreamAsyncFetcher extends AsyncFetcher
{ {
constructor(opts) { cdp: CDPSession;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
constructor(opts: any) {
super(opts); super(opts);
this.cdp = opts.cdp; this.cdp = opts.cdp;
} }
@ -883,7 +947,7 @@ class NetworkLoadStreamAsyncFetcher extends AsyncFetcher
return; return;
} }
reqresp.status = httpStatusCode; reqresp.status = httpStatusCode || 0;
reqresp.responseHeaders = headers || {}; reqresp.responseHeaders = headers || {};
return this.takeStreamIter(cdp, stream); return this.takeStreamIter(cdp, stream);
@ -892,15 +956,15 @@ class NetworkLoadStreamAsyncFetcher extends AsyncFetcher
// ================================================================= // =================================================================
// response // response
function createResponse(reqresp, pageid, contentIter) { function createResponse(reqresp: RequestResponseInfo, pageid: string, contentIter?: AsyncIterable<Uint8Array> | Iterable<Uint8Array>) {
const url = reqresp.url; const url = reqresp.url;
const warcVersion = "WARC/1.1"; const warcVersion = "WARC/1.1";
const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`; const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`;
const date = new Date().toISOString(); const date = new Date().toISOString();
const httpHeaders = reqresp.getResponseHeadersDict(reqresp.payload ? reqresp.payload.length : null); const httpHeaders = reqresp.getResponseHeadersDict(reqresp.payload ? reqresp.payload.length : 0);
const warcHeaders = { const warcHeaders : Record<string, string> = {
"WARC-Page-ID": pageid, "WARC-Page-ID": pageid,
}; };
@ -909,7 +973,7 @@ function createResponse(reqresp, pageid, contentIter) {
} }
if (!contentIter) { if (!contentIter) {
contentIter = [reqresp.payload]; contentIter = [reqresp.payload] as Iterable<Uint8Array>;
} }
if (Object.keys(reqresp.extraOpts).length) { if (Object.keys(reqresp.extraOpts).length) {
@ -923,7 +987,7 @@ function createResponse(reqresp, pageid, contentIter) {
// ================================================================= // =================================================================
// request // request
function createRequest(reqresp, responseRecord, pageid) { function createRequest(reqresp: RequestResponseInfo, responseRecord: WARCRecord, pageid: string) {
const url = reqresp.url; const url = reqresp.url;
const warcVersion = "WARC/1.1"; const warcVersion = "WARC/1.1";
const method = reqresp.method; const method = reqresp.method;
@ -936,12 +1000,12 @@ function createRequest(reqresp, responseRecord, pageid) {
const httpHeaders = reqresp.getRequestHeadersDict(); const httpHeaders = reqresp.getRequestHeadersDict();
const warcHeaders = { const warcHeaders : Record<string, string> = {
"WARC-Concurrent-To": responseRecord.warcHeader("WARC-Record-ID"), "WARC-Concurrent-To": responseRecord.warcHeader("WARC-Record-ID")!,
"WARC-Page-ID": pageid, "WARC-Page-ID": pageid,
}; };
const date = responseRecord.warcDate; const date = responseRecord.warcDate || undefined;
return WARCRecord.create({ return WARCRecord.create({
url, date, warcVersion, type: "request", warcHeaders, url, date, warcVersion, type: "request", warcHeaders,

View file

@ -1,4 +1,4 @@
import Redis from "ioredis"; import { Redis } from "ioredis";
import { logger } from "./logger.js"; import { logger } from "./logger.js";
const error = console.error; const error = console.error;
@ -15,7 +15,7 @@ console.error = function (...args) {
args[0].indexOf("[ioredis] Unhandled error event") === 0 args[0].indexOf("[ioredis] Unhandled error event") === 0
) { ) {
let now = Date.now(); const now = Date.now();
if ((now - lastLogTime) > REDIS_ERROR_LOG_INTERVAL_SECS) { if ((now - lastLogTime) > REDIS_ERROR_LOG_INTERVAL_SECS) {
if (lastLogTime && exitOnError) { if (lastLogTime && exitOnError) {
@ -29,7 +29,7 @@ console.error = function (...args) {
error.call(console, ...args); error.call(console, ...args);
}; };
export async function initRedis(url) { export async function initRedis(url: string) {
const redis = new Redis(url, {lazyConnect: true}); const redis = new Redis(url, {lazyConnect: true});
await redis.connect(); await redis.connect();
return redis; return redis;

View file

@ -1,5 +1,8 @@
// @ts-expect-error TODO fill in why error is expected
import { getStatusText } from "@webrecorder/wabac/src/utils.js"; import { getStatusText } from "@webrecorder/wabac/src/utils.js";
import { Protocol } from "puppeteer-core";
const CONTENT_LENGTH = "content-length"; const CONTENT_LENGTH = "content-length";
const CONTENT_TYPE = "content-type"; const CONTENT_TYPE = "content-type";
const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"]; const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"];
@ -8,53 +11,63 @@ const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"];
// =========================================================================== // ===========================================================================
export class RequestResponseInfo export class RequestResponseInfo
{ {
constructor(requestId) { _created: Date = new Date();
this._created = new Date();
requestId: string;
ts?: string;
method?: string;
url!: string;
protocol?: string = "HTTP/1.1";
// request data
requestHeaders?: Record<string, string>;
requestHeadersText?: string;
postData?: string;
hasPostData: boolean = false;
// response data
status: number = 0;
statusText?: string;
responseHeaders?: Record<string, string>;
responseHeadersList?: {name: string, value: string}[];
responseHeadersText?: string;
payload?: Uint8Array;
// misc
fromServiceWorker: boolean = false;
frameId?: string;
fetch: boolean = false;
resourceType?: string;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
extraOpts: Record<string, any> = {};
// stats
readSize: number = 0;
expectedSize: number = 0;
// set to true to indicate async loading in progress
asyncLoading: boolean = false;
// set to add truncated message
truncated?: string;
constructor(requestId: string) {
this.requestId = requestId; this.requestId = requestId;
this.ts = null;
// request data
this.method = null;
this.url = null;
this.protocol = "HTTP/1.1";
this.requestHeaders = null;
this.requestHeadersText = null;
this.postData = null;
this.hasPostData = false;
// response data
this.status = 0;
this.statusText = null;
this.responseHeaders = null;
this.responseHeadersList = null;
this.responseHeadersText = null;
this.payload = null;
this.fromServiceWorker = false;
this.fetch = false;
this.resourceType = null;
this.extraOpts = {};
this.readSize = 0;
this.expectedSize = 0;
// set to true to indicate async loading in progress
this.asyncLoading = false;
// set to add truncated message
this.truncated = null;
} }
fillRequest(params) { // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
fillRequest(params: Record<string, any>) {
this.url = params.request.url; this.url = params.request.url;
this.method = params.request.method; this.method = params.request.method;
if (!this.requestHeaders) { if (!this.requestHeaders) {
@ -69,7 +82,9 @@ export class RequestResponseInfo
} }
fillFetchRequestPaused(params) { // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
fillFetchRequestPaused(params: Record<string, any>) {
this.fillRequest(params); this.fillRequest(params);
this.status = params.responseStatusCode; this.status = params.responseStatusCode;
@ -83,7 +98,7 @@ export class RequestResponseInfo
this.frameId = params.frameId; this.frameId = params.frameId;
} }
fillResponse(response) { fillResponse(response: Protocol.Network.Response) {
// if initial fetch was a 200, but now replacing with 304, don't! // if initial fetch was a 200, but now replacing with 304, don't!
if (response.status == 304 && this.status && this.status != 304 && this.url) { if (response.status == 304 && this.status && this.status != 304 && this.url) {
return; return;
@ -112,8 +127,8 @@ export class RequestResponseInfo
this.fromServiceWorker = !!response.fromServiceWorker; this.fromServiceWorker = !!response.fromServiceWorker;
if (response.securityDetails) { if (response.securityDetails) {
const issuer = response.securityDetails.issuer || ""; const issuer : string = response.securityDetails.issuer || "";
const ctc = response.securityDetails.certificateTransparencyCompliance === "compliant" ? "1" : "0"; const ctc : string = response.securityDetails.certificateTransparencyCompliance === "compliant" ? "1" : "0";
this.extraOpts.cert = {issuer, ctc}; this.extraOpts.cert = {issuer, ctc};
} }
} }
@ -124,14 +139,15 @@ export class RequestResponseInfo
} }
try { try {
const headers = new Headers(this.responseHeaders); const headers = new Headers(this.responseHeaders);
const redirUrl = new URL(headers.get("location"), this.url).href; const location = headers.get("location") || "";
const redirUrl = new URL(location, this.url).href;
return this.url === redirUrl; return this.url === redirUrl;
} catch (e) { } catch (e) {
return false; return false;
} }
} }
fillResponseReceivedExtraInfo(params) { fillResponseReceivedExtraInfo(params: Record<string, string>) {
// this.responseHeaders = params.headers; // this.responseHeaders = params.headers;
// if (params.headersText) { // if (params.headersText) {
// this.responseHeadersText = params.headersText; // this.responseHeadersText = params.headersText;
@ -139,22 +155,28 @@ export class RequestResponseInfo
this.extraOpts.ipType = params.resourceIPAddressSpace; this.extraOpts.ipType = params.resourceIPAddressSpace;
} }
fillFetchResponse(response) { // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
fillFetchResponse(response: Record<string, any>) {
this.responseHeaders = Object.fromEntries(response.headers); this.responseHeaders = Object.fromEntries(response.headers);
this.status = response.status; this.status = response.status;
this.statusText = response.statusText || getStatusText(this.status); this.statusText = response.statusText || getStatusText(this.status);
} }
fillRequestExtraInfo(params) { // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
fillRequestExtraInfo(params: Record<string, any>) {
this.requestHeaders = params.headers; this.requestHeaders = params.headers;
} }
getResponseHeadersText() { getResponseHeadersText() {
let headers = `${this.protocol} ${this.status} ${this.statusText}\r\n`; let headers = `${this.protocol} ${this.status} ${this.statusText}\r\n`;
for (const header of Object.keys(this.responseHeaders)) { if (this.responseHeaders) {
headers += `${header}: ${this.responseHeaders[header].replace(/\n/g, ", ")}\r\n`; for (const header of Object.keys(this.responseHeaders)) {
headers += `${header}: ${this.responseHeaders[header].replace(/\n/g, ", ")}\r\n`;
}
} }
headers += "\r\n"; headers += "\r\n";
return headers; return headers;
@ -165,14 +187,14 @@ export class RequestResponseInfo
} }
getRequestHeadersDict() { getRequestHeadersDict() {
return this._getHeadersDict(this.requestHeaders, null); return this._getHeadersDict(this.requestHeaders);
} }
getResponseHeadersDict(length) { getResponseHeadersDict(length = 0) {
return this._getHeadersDict(this.responseHeaders, this.responseHeadersList, length); return this._getHeadersDict(this.responseHeaders, this.responseHeadersList, length);
} }
_getHeadersDict(headersDict, headersList, actualContentLength) { _getHeadersDict(headersDict?: Record<string, string>, headersList?: {name: string, value: string}[], actualContentLength = 0) {
if (!headersDict && headersList) { if (!headersDict && headersList) {
headersDict = {}; headersDict = {};

View file

@ -1,29 +1,39 @@
import ws from "ws"; import ws, { WebSocket } from "ws";
import http from "http"; import http, { IncomingMessage, ServerResponse } from "http";
import url from "url"; import url from "url";
import fs from "fs"; import fs from "fs";
import { initRedis } from "./redis.js"; import { initRedis } from "./redis.js";
import { logger } from "./logger.js"; import { logger } from "./logger.js";
import { Duplex } from "stream";
import { CDPSession, Page } from "puppeteer-core";
import { WorkerId } from "./state.js";
const indexHTML = fs.readFileSync(new URL("../html/screencast.html", import.meta.url), {encoding: "utf8"}); const indexHTML = fs.readFileSync(new URL("../../html/screencast.html", import.meta.url), {encoding: "utf8"});
// =========================================================================== // ===========================================================================
class WSTransport class WSTransport
{ {
constructor(port) { allWS = new Set<WebSocket>();
this.allWS = new Set(); // eslint-disable-next-line no-use-before-define
caster!: ScreenCaster;
wss: ws.Server;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
httpServer: any;
this.caster = null; constructor(port: number) {
this.allWS = new Set();
this.wss = new ws.Server({ noServer: true }); this.wss = new ws.Server({ noServer: true });
this.wss.on("connection", (ws) => this.initWebSocket(ws)); this.wss.on("connection", (ws: WebSocket) => this.initWebSocket(ws));
this.httpServer = http.createServer((...args) => this.handleRequest(...args)); this.httpServer = http.createServer((...args) => this.handleRequest(...args));
this.httpServer.on("upgrade", (request, socket, head) => { this.httpServer.on("upgrade", (request: IncomingMessage, socket: Duplex, head: Buffer) => {
const pathname = url.parse(request.url).pathname; const pathname = url.parse(request.url || "").pathname;
if (pathname === "/ws") { if (pathname === "/ws") {
this.wss.handleUpgrade(request, socket, head, (ws) => { this.wss.handleUpgrade(request, socket, head, (ws) => {
@ -35,8 +45,8 @@ class WSTransport
this.httpServer.listen(port); this.httpServer.listen(port);
} }
async handleRequest(req, res) { async handleRequest(req: IncomingMessage, res: ServerResponse) {
const pathname = url.parse(req.url).pathname; const pathname = url.parse(req.url || "").pathname;
switch (pathname) { switch (pathname) {
case "/": case "/":
res.writeHead(200, {"Content-Type": "text/html"}); res.writeHead(200, {"Content-Type": "text/html"});
@ -48,7 +58,7 @@ class WSTransport
res.end("Not Found"); res.end("Not Found");
} }
initWebSocket(ws) { initWebSocket(ws: WebSocket) {
for (const packet of this.caster.iterCachedData()) { for (const packet of this.caster.iterCachedData()) {
ws.send(JSON.stringify(packet)); ws.send(JSON.stringify(packet));
} }
@ -71,10 +81,12 @@ class WSTransport
}); });
} }
sendAll(packet) { // TODO: Fix this the next time the file is edited.
packet = JSON.stringify(packet); // eslint-disable-next-line @typescript-eslint/no-explicit-any
sendAll(packet: Record<any, any>) {
const packetStr = JSON.stringify(packet);
for (const ws of this.allWS) { for (const ws of this.allWS) {
ws.send(packet); ws.send(packetStr);
} }
} }
@ -87,22 +99,30 @@ class WSTransport
// =========================================================================== // ===========================================================================
class RedisPubSubTransport class RedisPubSubTransport
{ {
constructor(redisUrl, crawlId) { numConnections: number = 0;
this.numConnections = 0; castChannel: string;
// eslint-disable-next-line no-use-before-define
caster!: ScreenCaster;
ctrlChannel: string;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
redis: any;
/**
 * Derive the cast/control channel names from the crawl id and kick off the Redis setup.
 * @param redisUrl URL passed through to init(), which hands it to initRedis().
 * @param crawlId crawl identifier embedded in both channel names.
 */
constructor(redisUrl: string, crawlId: string) {
  const channelPrefix = `c:${crawlId}`;
  this.castChannel = `${channelPrefix}:cast`;
  this.ctrlChannel = `${channelPrefix}:ctrl`;

  // init() is async and is not awaited here; this.redis is only assigned once it has run.
  this.init(redisUrl);
}
async init(redisUrl) { async init(redisUrl: string) {
this.redis = await initRedis(redisUrl); this.redis = await initRedis(redisUrl);
const subRedis = await initRedis(redisUrl); const subRedis = await initRedis(redisUrl);
await subRedis.subscribe(this.ctrlChannel); await subRedis.subscribe(this.ctrlChannel);
subRedis.on("message", async (channel, message) => { subRedis.on("message", async (channel: string, message: string) => {
if (channel !== this.ctrlChannel) { if (channel !== this.ctrlChannel) {
return; return;
} }
@ -129,7 +149,9 @@ class RedisPubSubTransport
}); });
} }
/**
 * Publish one packet, JSON-encoded, on `this.castChannel`.
 * NOTE(review): this.redis is assigned asynchronously by init(); presumably callers
 * only publish after that has completed — confirm.
 */
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
async sendAll(packet: Record<any, any>) {
  const channel = this.castChannel;
  await this.redis.publish(channel, JSON.stringify(packet));
}
@ -143,19 +165,20 @@ class RedisPubSubTransport
// =========================================================================== // ===========================================================================
class ScreenCaster class ScreenCaster
{ {
constructor(transport, numWorkers) { transport: WSTransport;
caches = new Map<WorkerId, string>();
urls = new Map<WorkerId, string>();
cdps = new Map<WorkerId, CDPSession>();
maxWidth = 640;
maxHeight = 480;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
initMsg: {[key: string]: any};
constructor(transport: WSTransport, numWorkers: number) {
this.transport = transport; this.transport = transport;
this.transport.caster = this; this.transport.caster = this;
this.caches = new Map();
this.urls = new Map();
this.cdps = new Map();
// todo: make customizable
this.maxWidth = 640;
this.maxHeight = 480;
this.initMsg = { this.initMsg = {
msg: "init", msg: "init",
width: this.maxWidth, width: this.maxWidth,
@ -174,7 +197,7 @@ class ScreenCaster
} }
} }
async screencastPage(page, cdp, id) { async screencastPage(page: Page, cdp: CDPSession, id: WorkerId) {
this.urls.set(id, page.url()); this.urls.set(id, page.url());
// shouldn't happen, getting duplicate cdp // shouldn't happen, getting duplicate cdp
@ -220,7 +243,7 @@ class ScreenCaster
} }
} }
async stopById(id, sendClose=false) { async stopById(id: WorkerId, sendClose=false) {
this.caches.delete(id); this.caches.delete(id);
this.urls.delete(id); this.urls.delete(id);
@ -241,24 +264,32 @@ class ScreenCaster
this.cdps.delete(id); this.cdps.delete(id);
} }
/**
 * Start a CDP screencast on `cdp` unless this session has already been started.
 * @param cdp CDP session to start "Page.startScreencast" on.
 * @param id worker id, used only for the log entry.
 */
async startCast(cdp: CDPSession, id: WorkerId) {
  // `_startedCast` is an ad-hoc flag stored on the session object itself. Describe it
  // with a narrow intersection type instead of casting to `any`, so no lint
  // suppression is needed. This is a type-only change; runtime behavior is identical.
  const session = cdp as CDPSession & { _startedCast?: boolean };

  if (session._startedCast) {
    return;
  }

  session._startedCast = true;

  logger.info("Started Screencast", {workerid: id}, "screencast");

  await cdp.send("Page.startScreencast", {format: "png", everyNthFrame: 1, maxWidth: this.maxWidth, maxHeight: this.maxHeight});
}
async stopCast(cdp, id) { async stopCast(cdp: CDPSession, id: WorkerId) {
if (!cdp._startedCast) { // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
if (!(cdp as any)._startedCast) {
return; return;
} }
cdp._startedCast = false; // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(cdp as any)._startedCast = false;
logger.info("Stopping Screencast", {workerid: id}, "screencast"); logger.info("Stopping Screencast", {workerid: id}, "screencast");

View file

@ -2,11 +2,18 @@ import sharp from "sharp";
import { WARCResourceWriter } from "./warcresourcewriter.js"; import { WARCResourceWriter } from "./warcresourcewriter.js";
import { logger, errJSON } from "./logger.js"; import { logger, errJSON } from "./logger.js";
import { Browser } from "./browser.js";
// ============================================================================ // ============================================================================
/** Shape of one entry in `screenshotTypes`. */
type ScreenShotType = {
  /** Image format name, e.g. "png". */
  type: string;
  omitBackground: boolean;
  fullPage: boolean;
};
export const screenshotTypes : Record<string, ScreenShotType> = {
"view": { "view": {
type: "png", type: "png",
omitBackground: true, omitBackground: true,
@ -24,10 +31,15 @@ export const screenshotTypes = {
} }
}; };
export class Screenshots extends WARCResourceWriter { export class Screenshots extends WARCResourceWriter {
browser: Browser;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
page: any;
constructor(opts) { // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
constructor(opts: any) {
super({...opts, warcName: "screenshots.warc.gz"}); super({...opts, warcName: "screenshots.warc.gz"});
this.browser = opts.browser; this.browser = opts.browser;
this.page = opts.page; this.page = opts.page;

View file

@ -1,10 +1,34 @@
import { logger } from "./logger.js"; import { logger } from "./logger.js";
import { MAX_DEPTH } from "./constants.js"; import { MAX_DEPTH } from "./constants.js";
// Scope modes a ScopedSeed can be created with. For every value except "custom",
// the constructor derives its include rules and allowHash flag via scopeFromType().
type ScopeType =
| "prefix"
| "host"
| "domain"
| "page"
| "page-spa"
| "any"
| "custom";
export class ScopedSeed export class ScopedSeed
{ {
constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) { url: string;
scopeType: ScopeType;
include: RegExp[];
exclude: RegExp[] = [];
allowHash = false;
depth = -1;
sitemap?: string | null;
extraHops = 0;
maxExtraHops = 0;
maxDepth = 0;
constructor(
{url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} :
{url: string, scopeType: ScopeType, include: string[], exclude?: string[], allowHash?: boolean, depth?: number, sitemap?: string | boolean | null, extraHops?: number}
) {
const parsedUrl = this.parseUrl(url); const parsedUrl = this.parseUrl(url);
if (!parsedUrl) { if (!parsedUrl) {
throw new Error("Invalid URL"); throw new Error("Invalid URL");
@ -19,8 +43,9 @@ export class ScopedSeed
} }
if (this.scopeType !== "custom") { if (this.scopeType !== "custom") {
[include, allowHash] = this.scopeFromType(this.scopeType, parsedUrl); const [includeNew, allowHashNew] = this.scopeFromType(this.scopeType, parsedUrl);
this.include = [...include, ...this.include]; this.include = [...includeNew, ...this.include];
allowHash = allowHashNew;
} }
// for page scope, the depth is set to extraHops, as no other // for page scope, the depth is set to extraHops, as no other
@ -35,7 +60,10 @@ export class ScopedSeed
this.maxDepth = depth < 0 ? MAX_DEPTH : depth; this.maxDepth = depth < 0 ? MAX_DEPTH : depth;
} }
parseRx(value) { //parseRx(value? : union[string[], string, RegExp[]]) -> RegExp[] {
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
parseRx(value : any) {
if (value === null || value === undefined || value === "") { if (value === null || value === undefined || value === "") {
return []; return [];
} else if (!(value instanceof Array)) { } else if (!(value instanceof Array)) {
@ -45,7 +73,7 @@ export class ScopedSeed
} }
} }
addExclusion(value) { addExclusion(value: string | RegExp) {
if (!value) { if (!value) {
return; return;
} }
@ -55,7 +83,7 @@ export class ScopedSeed
this.exclude.push(value); this.exclude.push(value);
} }
removeExclusion(value) { removeExclusion(value: string) {
for (let i = 0; i < this.exclude.length; i++) { for (let i = 0; i < this.exclude.length; i++) {
if (this.exclude[i].toString() == value.toString()) { if (this.exclude[i].toString() == value.toString()) {
this.exclude.splice(i, 1); this.exclude.splice(i, 1);
@ -64,7 +92,7 @@ export class ScopedSeed
} }
} }
parseUrl(url, logDetails = {}) { parseUrl(url: string, logDetails = {}) {
let parsedUrl = null; let parsedUrl = null;
try { try {
parsedUrl = new URL(url.trim()); parsedUrl = new URL(url.trim());
@ -81,18 +109,21 @@ export class ScopedSeed
return parsedUrl; return parsedUrl;
} }
/**
 * Turn the `sitemap` seed option into an absolute sitemap URL.
 * @param sitemap `true` to take the seed URL with its path replaced by "/sitemap.xml",
 *   a string to resolve against the seed URL, anything else for "no sitemap".
 * @returns the resolved URL string, or null when no sitemap applies.
 */
resolveSiteMap(sitemap: boolean | string | null) : string | null {
  if (typeof sitemap === "string") {
    return new URL(sitemap, this.url).href;
  }

  if (sitemap !== true) {
    return null;
  }

  const defaultUrl = new URL(this.url);
  defaultUrl.pathname = "/sitemap.xml";
  return defaultUrl.href;
}
scopeFromType(scopeType, parsedUrl) { scopeFromType(scopeType: ScopeType, parsedUrl: URL) : [RegExp[], boolean] {
let include; let include : RegExp[] = [];
let allowHash = false; let allowHash = false;
switch (scopeType) { switch (scopeType) {
@ -132,26 +163,26 @@ export class ScopedSeed
return [include, allowHash]; return [include, allowHash];
} }
/** Report whether `depth` has reached (or exceeded) this seed's `maxDepth`. */
isAtMaxDepth(depth: number) {
  return this.maxDepth <= depth;
}
isIncluded(url, depth, extraHops = 0, logDetails = {}) { isIncluded(url: string, depth: number, extraHops = 0, logDetails = {}) {
if (depth > this.maxDepth) { if (depth > this.maxDepth) {
return false; return false;
} }
url = this.parseUrl(url, logDetails); const urlParsed = this.parseUrl(url, logDetails);
if (!url) { if (!urlParsed) {
return false; return false;
} }
if (!this.allowHash) { if (!this.allowHash) {
// remove hashtag // remove hashtag
url.hash = ""; urlParsed.hash = "";
} }
url = url.href; url = urlParsed.href;
if (url === this.url) { if (url === this.url) {
return true; return true;
@ -194,11 +225,11 @@ export class ScopedSeed
} }
} }
/**
 * Backslash-escape every regular-expression metacharacter in `string`
 * so the result matches the input literally when used in a RegExp.
 */
export function rxEscape(string: string) {
  const metaChars = /[-/\\^$*+?.()|[\]{}]/g;
  return string.replace(metaChars, (match) => "\\" + match);
}
/**
 * Escape `url` for use in a RegExp, then swap the first occurrence of the parsed
 * URL's protocol for "https?:" so the pattern matches both http and https.
 */
export function urlRxEscape(url: string, parsedUrl: URL) {
  const escapedUrl = rxEscape(url);
  return escapedUrl.replace(parsedUrl.protocol, "https?:");
}

View file

@ -1,60 +1,144 @@
import { Redis, Result, Callback } from "ioredis";
import { logger } from "./logger.js"; import { logger } from "./logger.js";
import { MAX_DEPTH } from "./constants.js"; import { MAX_DEPTH } from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { Frame } from "puppeteer-core";
// ============================================================================ // ============================================================================
/**
 * Load stage of a page. A PageState starts out as FAILED.
 * NOTE(review): the numeric order presumably reflects load progress — confirm before comparing values.
 */
export enum LoadState {
  FAILED = 0,
  CONTENT_LOADED = 1,
  FULL_PAGE_LOADED = 2,
  EXTRACTION_DONE = 3,
  BEHAVIORS_DONE = 4,
}
// ============================================================================ // ============================================================================
/**
 * Result codes for queueing a URL.
 * NOTE(review): presumably produced by addToQueue() — confirm against the caller.
 */
export enum QueueState {
  ADDED = 0,
  LIMIT_HIT = 1,
  DUPE_URL = 2,
}
// ============================================================================
export type WorkerId = number;
// ============================================================================ // ============================================================================
/**
 * In-memory state of one page, built from a queue entry.
 * Only url/seedId/depth/extraHops come from the constructor argument; every other
 * field starts at its declared default and is filled in elsewhere.
 */
export class PageState
{
  url: string;
  seedId: number;
  depth: number;
  extraHops: number;

  workerid!: WorkerId;

  pageid?: string;
  title?: string;
  mime?: string;

  // TODO: Fix this the next time the file is edited.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  callbacks: any;

  isHTMLPage?: boolean;
  text?: string;
  favicon?: string;

  skipBehaviors = false;
  filteredFrames: Frame[] = [];
  loadState : LoadState = LoadState.FAILED;

  logDetails = {};

  /**
   * @param redisData queue entry data; `extraHops` may be absent.
   */
  constructor(redisData: {url: string, seedId: number, depth: number, extraHops?: number}) {
    this.url = redisData.url;
    this.seedId = redisData.seedId;
    this.depth = redisData.depth;
    // addToQueue() only writes extraHops into the queue entry when it is non-zero,
    // so the field can be missing here; normalize to 0 to honor the `number` type.
    this.extraHops = redisData.extraHops ?? 0;
  }
}
// ============================================================================
// Module augmentation: adds typed signatures for custom commands to the ioredis
// client so they can be called as methods (e.g. this.redis.markstarted(...)).
// "addqueue" is registered with redis.defineCommand() in _initLuaCommands();
// presumably the others are registered there as well — confirm.
declare module "ioredis" {
interface RedisCommander<Context> {
// Declared to resolve to a number.
addqueue(
pkey: string,
qkey: string,
skey: string,
url: string,
score: number,
data: string,
limit: number,
): Result<number, Context>;

// Declared to resolve to a string.
getnext(
qkey: string,
pkey: string,
): Result<string, Context>;

// Called from markStarted() with the pending key, the per-URL key, the URL,
// a timestamp, maxPageTime and the uid.
markstarted(
pkey: string,
pkeyUrl: string,
url: string,
started: string,
maxPageTime: number,
uid: string,
): Result<void, Context>;

// Called from markFailed() with value "1" and state "failed".
movefailed(
pkey: string,
fkey: string,
url: string,
value: string,
state: string,
): Result<void, Context>;

unlockpending(
pkeyUrl: string,
uid: string,
callback?: Callback<string>
): Result<void, Context>;

// Declared to resolve to a number.
requeue(
pkey: string,
qkey: string,
pkeyUrl: string,
url: string,
maxRetryPending: number,
): Result<number, Context>;
}
}
// ============================================================================ // ============================================================================
export class RedisCrawlState export class RedisCrawlState
{ {
constructor(redis, key, maxPageTime, uid) { redis: Redis;
maxRetryPending = 1;
_lastSize = 0;
uid: string;
key: string;
maxPageTime: number;
qkey: string;
pkey: string;
skey: string;
dkey: string;
fkey: string;
ekey: string;
constructor(redis: Redis, key: string, maxPageTime: number, uid: string) {
this.redis = redis; this.redis = redis;
this.maxRetryPending = 1;
this._lastSize = 0;
this.uid = uid; this.uid = uid;
this.key = key; this.key = key;
@ -73,7 +157,7 @@ export class RedisCrawlState
this._initLuaCommands(this.redis); this._initLuaCommands(this.redis);
} }
_initLuaCommands(redis) { _initLuaCommands(redis: Redis) {
redis.defineCommand("addqueue", { redis.defineCommand("addqueue", {
numberOfKeys: 3, numberOfKeys: 3,
lua: ` lua: `
@ -184,58 +268,58 @@ return 0;
return new Date().toISOString(); return new Date().toISOString();
} }
/**
 * Invoke the custom "markstarted" command for `url`, passing the current timestamp,
 * the per-URL key (`<pkey>:<url>`), maxPageTime and this instance's uid.
 */
async markStarted(url: string) {
  const startedAt = this._timestamp();
  const urlKey = `${this.pkey}:${url}`;

  return await this.redis.markstarted(this.pkey, urlKey, url, startedAt, this.maxPageTime, this.uid);
}
/**
 * Delete `url` from the hash at `this.pkey`, then increment the counter at `this.dkey`.
 * @returns the counter value after the increment.
 */
async markFinished(url: string) {
  await this.redis.hdel(this.pkey, url);

  const doneCount = await this.redis.incr(this.dkey);
  return doneCount;
}
/**
 * Run the custom "movefailed" command for `url` (value "1", state "failed"),
 * then increment the counter at `this.dkey`.
 * @returns the counter value after the increment.
 */
async markFailed(url: string) {
  await this.redis.movefailed(this.pkey, this.fkey, url, "1", "failed");

  const doneCount = await this.redis.incr(this.dkey);
  return doneCount;
}
/**
 * Drop `url` from the hash at `this.pkey` and from the set at `this.skey`,
 * in that order.
 */
async markExcluded(url: string) {
  const { redis, pkey, skey } = this;

  await redis.hdel(pkey, url);
  await redis.srem(skey, url);
}
/**
 * Re-evaluate whether a queue entry is still in scope for its seed.
 * @param data queue entry; `seedId` indexes into `seeds`.
 * @param seeds seed list to look the entry's seed up in.
 * @returns whatever the seed's isIncluded() returns for the entry's url/depth/extraHops.
 */
recheckScope(data: {url: string, depth: number, extraHops: number, seedId: number}, seeds: ScopedSeed[]) {
  const owningSeed = seeds[data.seedId];

  return owningSeed.isIncluded(data.url, data.depth, data.extraHops);
}
/**
 * True when queueSize() is 0 and numDone() is greater than 0.
 * numDone() is only queried when the queue size check passes.
 */
async isFinished() {
  const remaining = await this.queueSize();
  if (remaining != 0) {
    return false;
  }

  const finished = await this.numDone();
  return finished > 0;
}
/** Store `status_` under this instance's uid in the `<key>:status` hash. */
async setStatus(status_: string) {
  const statusKey = `${this.key}:status`;
  await this.redis.hset(statusKey, this.uid, status_);
}
/**
 * Read this instance's entry from the `<key>:status` hash.
 * @returns the stored status, or "" when nothing (or an empty value) is stored.
 */
async getStatus() : Promise<string> {
  const stored = await this.redis.hget(`${this.key}:status`, this.uid);
  return stored ? stored : "";
}
/**
 * Store `size` under this instance's uid in the `<key>:size` hash.
 * @returns the result of the underlying hset call.
 */
async setArchiveSize(size: number) {
  const sizeKey = `${this.key}:size`;
  return await this.redis.hset(sizeKey, this.uid, size);
}
async isCrawlStopped() { async isCrawlStopped() {
if (await this.redis.get(`${this.key}:stopping`) === "1") { if ((await this.redis.get(`${this.key}:stopping`)) === "1") {
return true; return true;
} }
if (await this.redis.hget(`${this.key}:stopone`, this.uid) === "1") { if ((await this.redis.hget(`${this.key}:stopone`, this.uid)) === "1") {
return true; return true;
} }
@ -243,7 +327,7 @@ return 0;
} }
/** True when the `<key>:canceled` value in Redis is exactly "1". */
async isCrawlCanceled() {
  const flag = await this.redis.get(`${this.key}:canceled`);
  return flag === "1";
}
// note: not currently called in crawler, but could be // note: not currently called in crawler, but could be
@ -252,7 +336,7 @@ return 0;
await this.redis.set(`${this.key}:stopping`, "1"); await this.redis.set(`${this.key}:stopping`, "1");
} }
async processMessage(seeds) { async processMessage(seeds: ScopedSeed[]) {
while (true) { while (true) {
const result = await this.redis.lpop(`${this.uid}:msg`); const result = await this.redis.lpop(`${this.uid}:msg`);
if (!result) { if (!result) {
@ -285,18 +369,20 @@ return 0;
} }
break; break;
} }
} catch (e) { } // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
catch (e: any) {
logger.warn("Error processing message", e, "redisMessage"); logger.warn("Error processing message", e, "redisMessage");
} }
} }
} }
/**
 * Decide whether `s` should be treated as a plain string rather than a regex:
 * strip all backslashes, re-escape the regex metacharacters, and check whether
 * that reproduces `s` exactly.
 */
isStrMatch(s: string) {
  const withoutBackslashes = s.replace(/\\/g, "");
  const reEscaped = withoutBackslashes.replace(/[\\^$*+?.()|[\]{}]/g, "\\$&");

  // if matches original string, then consider not a regex
  return reEscaped === s;
}
filterQueue(regexStr) { filterQueue(regexStr: string) {
const regex = new RegExp(regexStr); const regex = new RegExp(regexStr);
let matcher = undefined; let matcher = undefined;
@ -325,7 +411,7 @@ return 0;
stream.resume(); stream.resume();
}); });
return new Promise(resolve => { return new Promise<void>(resolve => {
stream.on("end", () => { stream.on("end", () => {
resolve(); resolve();
}); });
@ -341,9 +427,12 @@ return 0;
return (res >= 3); return (res >= 3);
} }
async addToQueue({url, seedId, depth = 0, extraHops = 0} = {}, limit = 0) { //async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) {
async addToQueue({url, seedId, depth = 0, extraHops = 0} : {url: string, seedId: number, depth?: number, extraHops?: number}, limit = 0) {
const added = this._timestamp(); const added = this._timestamp();
const data = {added, url, seedId, depth}; // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const data : any = {added, url, seedId, depth};
if (extraHops) { if (extraHops) {
data.extraHops = extraHops; data.extraHops = extraHops;
} }
@ -375,8 +464,8 @@ return 0;
return new PageState(data); return new PageState(data);
} }
/** Report whether `url` is a member of the set at `this.skey`. */
async has(url: string) {
  const isMember = await this.redis.sismember(this.skey, url);
  return Boolean(isMember);
}
async serialize() { async serialize() {
@ -390,25 +479,25 @@ return 0;
return {done, queued, pending, failed, errors}; return {done, queued, pending, failed, errors};
} }
/**
 * Compute the numeric score for an entry: depth plus extraHops weighted by MAX_DEPTH.
 * Missing or falsy depth/extraHops count as 0.
 */
_getScore(data: {depth: number, extraHops: number}) {
  const depth = data.depth || 0;
  const hops = data.extraHops || 0;

  return depth + hops * MAX_DEPTH;
}
/**
 * Collect all members of the sorted set at `key` with score in [0, inf],
 * fetching them in pages of `inc` entries.
 * The set's cardinality is read once up front and bounds the number of pages.
 */
async _iterSortedKey(key: string, inc = 100) {
  const results : string[] = [];

  const total = await this.redis.zcard(key);

  let offset = 0;
  while (offset < total) {
    const page = await this.redis.zrangebyscore(key, 0, "inf", "LIMIT", offset, inc);
    for (const member of page) {
      results.push(member);
    }
    offset += inc;
  }

  return results;
}
async _iterListKeys(key, inc = 100) { async _iterListKeys(key: string, inc = 100) {
const results = []; const results : string[] = [];
const len = await this.redis.llen(key); const len = await this.redis.llen(key);
@ -419,8 +508,10 @@ return 0;
return results; return results;
} }
async load(state, seeds, checkScope) { // TODO: Fix this the next time the file is edited.
const seen = []; // eslint-disable-next-line @typescript-eslint/no-explicit-any
async load(state: Record<string, any>, seeds: ScopedSeed[], checkScope: boolean) {
const seen : string[] = [];
// need to delete existing keys, if exist to fully reset state // need to delete existing keys, if exist to fully reset state
await this.redis.del(this.qkey); await this.redis.del(this.qkey);
@ -486,7 +577,7 @@ return 0;
/**
 * Read the counter stored at `this.dkey`.
 * @returns the counter as an integer; 0 when the key is missing or empty.
 */
async numDone() {
  const done = await this.redis.get(this.dkey);
  // Always parse as base 10; without a radix, a "0x"-prefixed value would be read as hex.
  return Number.parseInt(done || "0", 10);
}
async numSeen() { async numSeen() {
@ -524,7 +615,9 @@ return 0;
for (const url of pendingUrls) { for (const url of pendingUrls) {
await this.redis.unlockpending(this.pkey + ":" + url, this.uid); await this.redis.unlockpending(this.pkey + ":" + url, this.uid);
} }
} catch (e) { } // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
catch (e: any) {
logger.error("Redis Del Pending Failed", e, "state"); logger.error("Redis Del Pending Failed", e, "state");
} }
} }
@ -551,15 +644,15 @@ return 0;
return this._lastSize; return this._lastSize;
} }
/**
 * Add `value` to the set at `key`.
 * @returns true when sadd reports exactly one newly added member.
 */
async addIfNoDupe(key: string, value: string) {
  const addedCount = await this.redis.sadd(key, value);
  return addedCount === 1;
}
/**
 * Remove `value` from the set at `key`.
 * @returns the result of the underlying srem call.
 */
async removeDupe(key: string, value: string) {
  const removedCount = await this.redis.srem(key, value);
  return removedCount;
}
/**
 * Push `error` onto the head of the list at `this.ekey`.
 * @returns the result of the underlying lpush call.
 */
async logError(error: string) {
  const listLength = await this.redis.lpush(this.ekey, error);
  return listLength;
}
} }

View file

@ -5,20 +5,40 @@ import util from "util";
import os from "os"; import os from "os";
import { createHash } from "crypto"; import { createHash } from "crypto";
import crc32 from "crc/crc32"; import crc32 from "crc/crc32";
import Minio from "minio"; import * as Minio from "minio";
import { initRedis } from "./redis.js"; import { initRedis } from "./redis.js";
import { logger } from "./logger.js"; import { logger } from "./logger.js";
// @ts-expect-error TODO fill in why error is expected
import getFolderSize from "get-folder-size"; import getFolderSize from "get-folder-size";
// =========================================================================== // ===========================================================================
export class S3StorageSync export class S3StorageSync
{ {
constructor(urlOrData, {webhookUrl, userId, crawlId} = {}) { fullPrefix: string;
client: Minio.Client;
bucketName: string;
objectPrefix: string;
resources: object[] = [];
userId: string;
crawlId: string;
webhookUrl?: string;
// TODO: Fix this the next time the file is edited.
constructor(
// eslint-disable-next-line @typescript-eslint/no-explicit-any
urlOrData: string | any,
{webhookUrl, userId, crawlId} :
{webhookUrl?: string, userId: string, crawlId: string}
) {
let url; let url;
let accessKey; let accessKey;
let secretKey; let secretKey;
@ -47,8 +67,6 @@ export class S3StorageSync
partSize: 100*1024*1024 partSize: 100*1024*1024
}); });
this.client.enableSHA256 = true;
this.bucketName = url.pathname.slice(1).split("/")[0]; this.bucketName = url.pathname.slice(1).split("/")[0];
this.objectPrefix = url.pathname.slice(this.bucketName.length + 2); this.objectPrefix = url.pathname.slice(this.bucketName.length + 2);
@ -60,12 +78,12 @@ export class S3StorageSync
this.webhookUrl = webhookUrl; this.webhookUrl = webhookUrl;
} }
async uploadFile(srcFilename, targetFilename) { async uploadFile(srcFilename: string, targetFilename: string) {
const fileUploadInfo = { const fileUploadInfo = {
"bucket": this.bucketName, "bucket": this.bucketName,
"crawlId": this.crawlId, "crawlId": this.crawlId,
"prefix": this.objectPrefix, "prefix": this.objectPrefix,
"targetFilename": this.targetFilename targetFilename
}; };
logger.info("S3 file upload information", fileUploadInfo, "s3Upload"); logger.info("S3 file upload information", fileUploadInfo, "s3Upload");
@ -80,13 +98,13 @@ export class S3StorageSync
return {path, size, hash, crc32, bytes: size}; return {path, size, hash, crc32, bytes: size};
} }
/**
 * Download the object `<objectPrefix><srcFilename>` from this instance's bucket
 * to the local path `destFilename`.
 */
async downloadFile(srcFilename: string, destFilename: string) {
  const objectName = this.objectPrefix + srcFilename;
  await this.client.fGetObject(this.bucketName, objectName, destFilename);
}
async uploadCollWACZ(srcFilename, targetFilename, completed = true) { async uploadCollWACZ(srcFilename: string, targetFilename: string, completed = true) {
const resource = await this.uploadFile(srcFilename, targetFilename); const resource = await this.uploadFile(srcFilename, targetFilename);
logger.info("WACZ S3 file upload resource", {...targetFilename, resource}, "s3Upload"); logger.info("WACZ S3 file upload resource", {targetFilename, resource}, "s3Upload");
if (this.webhookUrl) { if (this.webhookUrl) {
const body = { const body = {
@ -130,8 +148,8 @@ export function initStorage() {
const opts = { const opts = {
crawlId: process.env.CRAWL_ID || os.hostname(), crawlId: process.env.CRAWL_ID || os.hostname(),
webhookUrl: process.env.WEBHOOK_URL, webhookUrl: process.env.WEBHOOK_URL || "",
userId: process.env.STORE_USER, userId: process.env.STORE_USER || "",
}; };
logger.info("Initing Storage..."); logger.info("Initing Storage...");
@ -139,12 +157,12 @@ export function initStorage() {
} }
/**
 * Return the size of `filename` as reported by stat().
 * Rejects if the file cannot be stat'ed.
 */
export async function getFileSize(filename: string) {
  const { size } = await fsp.stat(filename);
  return size;
}
export async function getDirSize(dir) { export async function getDirSize(dir: string) {
const { size, errors } = await getFolderSize(dir); const { size, errors } = await getFolderSize(dir);
if (errors && errors.length) { if (errors && errors.length) {
logger.warn("Size check errors", {errors}, "sizecheck"); logger.warn("Size check errors", {errors}, "sizecheck");
@ -152,8 +170,10 @@ export async function getDirSize(dir) {
return size; return size;
} }
export async function checkDiskUtilization(params, archiveDirSize, dfOutput=null) { // TODO: Fix this the next time the file is edited.
const diskUsage = await getDiskUsage("/crawls", dfOutput); // eslint-disable-next-line @typescript-eslint/no-explicit-any
export async function checkDiskUtilization(params: Record<string, any>, archiveDirSize: number, dfOutput=null) {
const diskUsage : Record<string, string> = await getDiskUsage("/crawls", dfOutput);
const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1)); const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1));
// Check that disk usage isn't already above threshold // Check that disk usage isn't already above threshold
@ -199,19 +219,21 @@ export async function checkDiskUtilization(params, archiveDirSize, dfOutput=null
}; };
} }
/**
 * Run `df` on `path` and return its raw stdout.
 * Rejects if `df` exits with a non-zero status (e.g. the path does not exist).
 * @param path filesystem path to report on.
 */
export async function getDFOutput(path: string) {
  // execFile passes `path` as a single argv entry instead of splicing it into a
  // shell command line, so spaces or shell metacharacters in the path cannot
  // break the command or run anything else.
  const execFile = util.promisify(child_process.execFile);
  const res = await execFile("df", [path]);
  return res.stdout;
}
export async function getDiskUsage(path="/crawls", dfOutput = null) { export async function getDiskUsage(path="/crawls", dfOutput = null) {
const result = dfOutput || await getDFOutput(path); const result = dfOutput || (await getDFOutput(path));
const lines = result.split("\n"); const lines = result.split("\n");
const keys = lines[0].split(/\s+/ig); const keys = lines[0].split(/\s+/ig);
const rows = lines.slice(1).map(line => { const rows = lines.slice(1).map(line => {
const values = line.split(/\s+/ig); const values = line.split(/\s+/ig);
return keys.reduce((o, k, index) => { // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
return keys.reduce((o: Record<string, any>, k, index) => {
o[k] = values[index]; o[k] = values[index];
return o; return o;
}, {}); }, {});
@ -219,14 +241,14 @@ export async function getDiskUsage(path="/crawls", dfOutput = null) {
return rows[0]; return rows[0];
} }
/** Return `used` as a percentage of `total`, rounded to the nearest integer. */
export function calculatePercentageUsed(used: number, total: number) {
  const fraction = used / total;
  return Math.round(fraction * 100);
}
function checksumFile(hashName, path) { function checksumFile(hashName: string, path: string) : Promise<{hash: string, crc32: number}>{
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
const hash = createHash(hashName); const hash = createHash(hashName);
let crc = null; let crc : number = 0;
const stream = fs.createReadStream(path); const stream = fs.createReadStream(path);
stream.on("error", err => reject(err)); stream.on("error", err => reject(err));
@ -238,7 +260,7 @@ function checksumFile(hashName, path) {
}); });
} }
export function interpolateFilename(filename, crawlId) { export function interpolateFilename(filename: string, crawlId: string) {
filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.-]/g, "")); filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.-]/g, ""));
filename = filename.replace("@hostname", os.hostname()); filename = filename.replace("@hostname", os.hostname());
filename = filename.replace("@hostsuffix", os.hostname().slice(-14)); filename = filename.replace("@hostsuffix", os.hostname().slice(-14));

View file

@ -1,16 +1,21 @@
import { WARCResourceWriter } from "./warcresourcewriter.js"; import { WARCResourceWriter } from "./warcresourcewriter.js";
import { logger } from "./logger.js"; import { logger } from "./logger.js";
import { CDPSession, Protocol } from "puppeteer-core";
// ============================================================================ // ============================================================================
export class BaseTextExtract extends WARCResourceWriter { export abstract class BaseTextExtract extends WARCResourceWriter {
constructor(cdp, opts) { cdp: CDPSession;
lastText: string | null = null;
text: string | null = null;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
constructor(cdp: CDPSession, opts: any) {
super({...opts, warcName: "text.warc.gz"}); super({...opts, warcName: "text.warc.gz"});
this.cdp = cdp; this.cdp = cdp;
this.lastText = null;
} }
async extractAndStoreText(resourceType, ignoreIfMatchesLast = false, saveToWarc = false) { async extractAndStoreText(resourceType: string, ignoreIfMatchesLast = false, saveToWarc = false) {
try { try {
const text = await this.doGetText(); const text = await this.doGetText();
@ -26,26 +31,26 @@ export class BaseTextExtract extends WARCResourceWriter {
this.lastText = text; this.lastText = text;
return {changed: true, text}; return {changed: true, text};
} catch (e) { } // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
catch (e: any) {
logger.debug("Error extracting text", e, "text"); logger.debug("Error extracting text", e, "text");
return {changed: false, text: null}; return {changed: false, text: null};
} }
} }
async doGetText() { abstract doGetText() : Promise<string>;
throw new Error("unimplemented");
}
} }
// ============================================================================ // ============================================================================
export class TextExtractViaSnapshot extends BaseTextExtract { export class TextExtractViaSnapshot extends BaseTextExtract {
async doGetText() { async doGetText() : Promise<string> {
const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {computedStyles: []}); const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {computedStyles: []});
return this.parseTextFromDOMSnapshot(result); return this.parseTextFromDOMSnapshot(result);
} }
parseTextFromDOMSnapshot(result) { parseTextFromDOMSnapshot(result: Protocol.DOMSnapshot.CaptureSnapshotResponse) : string {
const TEXT_NODE = 3; const TEXT_NODE = 3;
const ELEMENT_NODE = 1; const ELEMENT_NODE = 1;
@ -53,13 +58,13 @@ export class TextExtractViaSnapshot extends BaseTextExtract {
const {strings, documents} = result; const {strings, documents} = result;
const accum = []; const accum : string[] = [];
for (const doc of documents) { for (const doc of documents) {
const nodeValues = doc.nodes.nodeValue; const nodeValues = doc.nodes.nodeValue || [];
const nodeNames = doc.nodes.nodeName; const nodeNames = doc.nodes.nodeName || [];
const nodeTypes = doc.nodes.nodeType; const nodeTypes = doc.nodes.nodeType || [];
const parentIndex = doc.nodes.parentIndex; const parentIndex = doc.nodes.parentIndex || [];
for (let i = 0; i < nodeValues.length; i++) { for (let i = 0; i < nodeValues.length; i++) {
if (nodeValues[i] === -1) { if (nodeValues[i] === -1) {
@ -74,28 +79,28 @@ export class TextExtractViaSnapshot extends BaseTextExtract {
if (!SKIPPED_NODES.includes(name)) { if (!SKIPPED_NODES.includes(name)) {
const value = strings[nodeValues[i]].trim(); const value = strings[nodeValues[i]].trim();
if (value) { if (value) {
accum.push(value); accum.push(value as string);
} }
} }
} }
} }
} }
return accum.join("\n");
} }
return accum.join("\n");
} }
} }
// ============================================================================ // ============================================================================
export class TextExtractViaDocument extends BaseTextExtract { export class TextExtractViaDocument extends BaseTextExtract {
async doGetText() { async doGetText() : Promise<string> {
const result = await this.cdp.send("DOM.getDocument", {"depth": -1, "pierce": true}); const result = await this.cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
return this.parseTextFromDOM(result); return this.parseTextFromDOM(result);
} }
async parseTextFromDom(dom) { parseTextFromDOM(dom: Protocol.DOM.GetDocumentResponse) : string {
const accum = []; const accum : string[] = [];
const metadata = {}; const metadata = {};
this.parseText(dom.root, metadata, accum); this.parseText(dom.root, metadata, accum);
@ -103,9 +108,9 @@ export class TextExtractViaDocument extends BaseTextExtract {
return accum.join("\n"); return accum.join("\n");
} }
async parseText(node, metadata, accum) { parseText(node: Protocol.DOM.Node, metadata: Record<string, string> | null, accum: string[]) {
const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"]; const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];
const EMPTY_LIST = []; const EMPTY_LIST : Protocol.DOM.Node[] = [];
const TEXT = "#text"; const TEXT = "#text";
const TITLE = "title"; const TITLE = "title";
@ -123,9 +128,9 @@ export class TextExtractViaDocument extends BaseTextExtract {
accum.push(value); accum.push(value);
} }
} else if (name === TITLE) { } else if (name === TITLE) {
const title = []; const title : string[] = [];
for (let child of children) { for (const child of children) {
this.parseText(child, null, title); this.parseText(child, null, title);
} }
@ -135,7 +140,7 @@ export class TextExtractViaDocument extends BaseTextExtract {
accum.push(title.join(" ")); accum.push(title.join(" "));
} }
} else { } else {
for (let child of children) { for (const child of children) {
this.parseText(child, metadata, accum); this.parseText(child, metadata, accum);
} }

View file

@ -1,14 +1,24 @@
import { logger } from "./logger.js"; import { logger } from "./logger.js";
export function sleep(seconds) { export function sleep(seconds: number) {
return new Promise(resolve => setTimeout(resolve, seconds * 1000)); return new Promise(resolve => setTimeout(resolve, seconds * 1000));
} }
export function timedRun(promise, seconds, message="Promise timed out", logDetails={}, context="general", isWarn=false) { // TODO: Fix this the next time the file is edited.
export function timedRun(
// eslint-disable-next-line @typescript-eslint/no-explicit-any
promise: Promise<any>,
seconds: number,
message="Promise timed out",
logDetails={},
context="general",
isWarn=false
) {
// return Promise return value or log error if timeout is reached first // return Promise return value or log error if timeout is reached first
const timeout = seconds * 1000; const timeout = seconds * 1000;
const rejectPromiseOnTimeout = (timeout) => { const rejectPromiseOnTimeout = (timeout: number) => {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
setTimeout(() => (reject("timeout reached")), timeout); setTimeout(() => (reject("timeout reached")), timeout);
}); });
@ -26,7 +36,7 @@ export function timedRun(promise, seconds, message="Promise timed out", logDetai
}); });
} }
export function secondsElapsed(startTime, nowDate = null) { export function secondsElapsed(startTime: number, nowDate: Date | null = null) {
nowDate = nowDate || new Date(); nowDate = nowDate || new Date();
return (nowDate.getTime() - startTime) / 1000; return (nowDate.getTime() - startTime) / 1000;

View file

@ -4,27 +4,35 @@ import * as warcio from "warcio";
export class WARCResourceWriter export class WARCResourceWriter
{ {
constructor({url, directory, date, warcName}) { // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
page: any;
url: string;
directory: string;
warcName: string;
date: Date;
constructor({url, directory, date, warcName} : {url: string, directory: string, date: Date, warcName: string}) {
this.url = url; this.url = url;
this.directory = directory; this.directory = directory;
this.warcName = path.join(this.directory, warcName); this.warcName = path.join(this.directory, warcName);
this.date = date ? date : new Date(); this.date = date ? date : new Date();
} }
async writeBufferToWARC(contents, resourceType, contentType) { async writeBufferToWARC(contents: Uint8Array, resourceType: string, contentType: string) {
const warcRecord = await this.wrap(contents, resourceType, contentType); const warcRecord = await this.wrap(contents, resourceType, contentType);
const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true}); const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});
fs.appendFileSync(this.warcName, warcRecordBuffer); fs.appendFileSync(this.warcName, warcRecordBuffer);
} }
async wrap(buffer, resourceType, contentType) { async wrap(buffer: Uint8Array, resourceType: string, contentType: string) {
const warcVersion = "WARC/1.1"; const warcVersion = "WARC/1.1";
const warcRecordType = "resource"; const warcRecordType = "resource";
const warcHeaders = {"Content-Type": contentType}; const warcHeaders = {"Content-Type": contentType};
async function* content() { async function* content() {
yield buffer; yield buffer;
} }
let resourceUrl = `urn:${resourceType}:${this.url}`; const resourceUrl = `urn:${resourceType}:${this.url}`;
return warcio.WARCRecord.create({ return warcio.WARCRecord.create({
url: resourceUrl, url: resourceUrl,

View file

@ -1,15 +1,32 @@
import fs from "fs"; import fs from "fs";
import { Writable } from "stream";
import path from "path"; import path from "path";
import { CDXIndexer } from "warcio"; import { CDXIndexer } from "warcio";
import { WARCSerializer } from "warcio/node"; import { WARCSerializer } from "warcio/node";
import { logger, errJSON } from "./logger.js"; import { logger, errJSON } from "./logger.js";
import type { IndexerOffsetLength, WARCRecord } from "warcio";
// ================================================================= // =================================================================
export class WARCWriter export class WARCWriter implements IndexerOffsetLength
{ {
constructor({archivesDir, tempCdxDir, filename, gzip, logDetails}) { archivesDir: string;
tempCdxDir: string;
filename: string;
gzip: boolean;
logDetails: Record<string, string>;
offset = 0;
recordLength = 0;
indexer?: CDXIndexer;
fh?: Writable | null;
cdxFH?: Writable | null;
constructor({archivesDir, tempCdxDir, filename, gzip, logDetails} :
{archivesDir: string, tempCdxDir: string, filename: string, gzip: boolean, logDetails: Record<string, string>}) {
this.archivesDir = archivesDir; this.archivesDir = archivesDir;
this.tempCdxDir = tempCdxDir; this.tempCdxDir = tempCdxDir;
this.filename = filename; this.filename = filename;
@ -21,12 +38,7 @@ export class WARCWriter
if (this.tempCdxDir) { if (this.tempCdxDir) {
this.indexer = new CDXIndexer({format: "cdxj"}); this.indexer = new CDXIndexer({format: "cdxj"});
} else {
this.indexer = null;
} }
this.fh = null;
this.cdxFH = null;
} }
async initFH() { async initFH() {
@ -38,7 +50,7 @@ export class WARCWriter
} }
} }
async writeRecordPair(responseRecord, requestRecord, responseSerializer = null) { async writeRecordPair(responseRecord: WARCRecord, requestRecord: WARCRecord, responseSerializer: WARCSerializer | undefined = undefined) {
const opts = {gzip: this.gzip}; const opts = {gzip: this.gzip};
if (!responseSerializer) { if (!responseSerializer) {
@ -58,10 +70,14 @@ export class WARCWriter
} }
async _writeRecord(record, serializer) { async _writeRecord(record: WARCRecord, serializer: WARCSerializer) {
let total = 0; let total = 0;
const url = record.warcTargetURI; const url = record.warcTargetURI;
if (!this.fh) {
throw new Error("writer not initialized");
}
for await (const chunk of serializer) { for await (const chunk of serializer) {
total += chunk.length; total += chunk.length;
try { try {
@ -74,12 +90,12 @@ export class WARCWriter
return total; return total;
} }
_writeCDX(record) { _writeCDX(record: WARCRecord | null) {
if (this.indexer) { if (this.indexer) {
const cdx = this.indexer.indexRecord(record, this, this.filename); const cdx = this.indexer.indexRecord(record, this, this.filename);
if (this.indexer && this.cdxFH && cdx) { if (this.indexer && this.cdxFH && cdx) {
this.indexer.write(cdx, this.cdxFH); this.indexer.write(cdx, this.cdxFH as NodeJS.WriteStream);
} }
} }
@ -102,8 +118,8 @@ export class WARCWriter
} }
// ================================================================= // =================================================================
export function streamFinish(fh) { export function streamFinish(fh: Writable) {
const p = new Promise(resolve => { const p = new Promise<void>(resolve => {
fh.once("finish", () => resolve()); fh.once("finish", () => resolve());
}); });
fh.end(); fh.end();

View file

@ -6,6 +6,8 @@ import { logger, errJSON } from "./logger.js";
import { sleep, timedRun } from "./timing.js"; import { sleep, timedRun } from "./timing.js";
import { Recorder } from "./recorder.js"; import { Recorder } from "./recorder.js";
import { rxEscape } from "./seeds.js"; import { rxEscape } from "./seeds.js";
import { CDPSession, Page } from "puppeteer-core";
import { PageState, WorkerId } from "./state.js";
const MAX_REUSE = 5; const MAX_REUSE = 5;
@ -14,7 +16,9 @@ const TEARDOWN_TIMEOUT = 10;
const FINISHED_TIMEOUT = 60; const FINISHED_TIMEOUT = 60;
// =========================================================================== // ===========================================================================
export function runWorkers(crawler, numWorkers, maxPageTime, collDir) { // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export function runWorkers(crawler: any, numWorkers: number, maxPageTime: number, collDir: string) {
logger.info(`Creating ${numWorkers} workers`, {}, "worker"); logger.info(`Creating ${numWorkers} workers`, {}, "worker");
const workers = []; const workers = [];
@ -29,40 +33,73 @@ export function runWorkers(crawler, numWorkers, maxPageTime, collDir) {
const rx = new RegExp(rxEscape(process.env.CRAWL_ID) + "\\-([\\d]+)$"); const rx = new RegExp(rxEscape(process.env.CRAWL_ID) + "\\-([\\d]+)$");
const m = os.hostname().match(rx); const m = os.hostname().match(rx);
if (m) { if (m) {
offset = m[1] * numWorkers; offset = Number(m[1]) * numWorkers;
logger.info("Starting workerid index at " + offset, "worker"); logger.info("Starting workerid index at " + offset, "worker");
} }
} }
for (let i = 0; i < numWorkers; i++) { for (let i = 0; i < numWorkers; i++) {
workers.push(new PageWorker(i + offset, crawler, maxPageTime, collDir)); workers.push(new PageWorker((i + offset), crawler, maxPageTime, collDir));
} }
return Promise.allSettled(workers.map((worker) => worker.run())); return Promise.allSettled(workers.map((worker) => worker.run()));
} }
// ===========================================================================
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export type WorkerOpts = Record<string, any> & {
page: Page;
cdp: CDPSession;
workerid: WorkerId;
// eslint-disable-next-line @typescript-eslint/ban-types
callbacks: Record<string, Function>;
directFetchCapture?: ((url: string) => Promise<{fetched: boolean, mime: string}>) | null;
};
// ===========================================================================
export type WorkerState = WorkerOpts & {
data: PageState
};
// =========================================================================== // ===========================================================================
export class PageWorker export class PageWorker
{ {
constructor(id, crawler, maxPageTime, collDir) { id: WorkerId;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
crawler: any;
maxPageTime: number;
reuseCount = 0;
page?: Page | null;
cdp?: CDPSession | null;
// eslint-disable-next-line @typescript-eslint/ban-types
callbacks?: Record<string, Function>;
opts?: WorkerOpts;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
logDetails: Record<string, any> = {};
crashed = false;
markCrashed?: (reason: string) => void;
crashBreak?: Promise<void>;
recorder: Recorder;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
constructor(id: WorkerId, crawler: any, maxPageTime: number, collDir: string) {
this.id = id; this.id = id;
this.crawler = crawler; this.crawler = crawler;
this.maxPageTime = maxPageTime; this.maxPageTime = maxPageTime;
this.reuseCount = 0;
this.page = null;
this.cdp = null;
this.callbacks = null;
this.opts = null;
this.logDetails = {workerid: this.id}; this.logDetails = {workerid: this.id};
this.crashed = false;
this.markCrashed = null;
this.crashBreak = null;
this.recorder = new Recorder({workerid: id, collDir, crawler: this.crawler}); this.recorder = new Recorder({workerid: id, collDir, crawler: this.crawler});
this.crawler.browser.recorders.push(this.recorder); this.crawler.browser.recorders.push(this.recorder);
@ -108,9 +145,9 @@ export class PageWorker
} }
} }
isSameOrigin(url) { isSameOrigin(url: string) {
try { try {
const currURL = new URL(this.page.url()); const currURL = new URL(this.page ? this.page.url() : "");
const newURL = new URL(url); const newURL = new URL(url);
return currURL.origin === newURL.origin; return currURL.origin === newURL.origin;
} catch (e) { } catch (e) {
@ -118,8 +155,8 @@ export class PageWorker
} }
} }
async initPage(url) { async initPage(url: string) : Promise<WorkerOpts> {
if (!this.crashed && this.page && ++this.reuseCount <= MAX_REUSE && this.isSameOrigin(url)) { if (!this.crashed && this.page && this.opts && ++this.reuseCount <= MAX_REUSE && this.isSameOrigin(url)) {
logger.debug("Reusing page", {reuseCount: this.reuseCount, ...this.logDetails}, "worker"); logger.debug("Reusing page", {reuseCount: this.reuseCount, ...this.logDetails}, "worker");
return this.opts; return this.opts;
} else if (this.page) { } else if (this.page) {
@ -151,10 +188,10 @@ export class PageWorker
this.page = page; this.page = page;
this.cdp = cdp; this.cdp = cdp;
this.callbacks = {}; this.callbacks = {};
const directFetchCapture = this.recorder ? (x) => this.recorder.directFetchCapture(x) : null; const directFetchCapture = this.recorder ? (x: string) => this.recorder.directFetchCapture(x) : null;
this.opts = { this.opts = {
page: this.page, page,
cdp: this.cdp, cdp,
workerid, workerid,
callbacks: this.callbacks, callbacks: this.callbacks,
directFetchCapture, directFetchCapture,
@ -168,15 +205,19 @@ export class PageWorker
this.crashed = false; this.crashed = false;
this.crashBreak = new Promise((resolve, reject) => this.markCrashed = reject); this.crashBreak = new Promise((resolve, reject) => this.markCrashed = reject);
this.logDetails = {page: this.page.url(), workerid}; this.logDetails = {page: page.url(), workerid};
// more serious page crash, mark as failed // more serious page crash, mark as failed
this.page.on("error", (err) => { // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
page.on("error", (err: any) => {
// ensure we're still on this page, otherwise ignore! // ensure we're still on this page, otherwise ignore!
if (this.page === page) { if (this.page === page) {
logger.error("Page Crashed", {...errJSON(err), ...this.logDetails}, "worker"); logger.error("Page Crashed", {...errJSON(err), ...this.logDetails}, "worker");
this.crashed = true; this.crashed = true;
this.markCrashed("crashed"); if (this.markCrashed) {
this.markCrashed("crashed");
}
} }
}); });
@ -204,9 +245,11 @@ export class PageWorker
} }
} }
} }
throw new Error("no page available, shouldn't get here");
} }
async crawlPage(opts) { async crawlPage(opts: WorkerState) {
const res = await this.crawler.crawlPage(opts); const res = await this.crawler.crawlPage(opts);
if (this.recorder) { if (this.recorder) {
await this.recorder.finishPage(); await this.recorder.finishPage();
@ -214,7 +257,7 @@ export class PageWorker
return res; return res;
} }
async timedCrawlPage(opts) { async timedCrawlPage(opts: WorkerState) {
const workerid = this.id; const workerid = this.id;
const { data } = opts; const { data } = opts;
const { url } = data; const { url } = data;
@ -244,7 +287,7 @@ export class PageWorker
]); ]);
} catch (e) { } catch (e) {
if (e.message !== "logged" && !this.crashed) { if (e instanceof Error && e.message !== "logged" && !this.crashed) {
logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker"); logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
} }
} finally { } finally {
@ -317,7 +360,7 @@ export class PageWorker
await sleep(0.5); await sleep(0.5);
} else { } else {
// if no pending and queue size is still empty, we're done! // if no pending and queue size is still empty, we're done!
if (!await crawlState.queueSize()) { if (!(await crawlState.queueSize())) {
break; break;
} }
} }

View file

@ -1,3 +1,4 @@
/* eslint-disable @typescript-eslint/no-unused-vars */
class TestBehavior2 class TestBehavior2
{ {
static init() { static init() {

View file

@ -1,3 +1,4 @@
/* eslint-disable @typescript-eslint/no-unused-vars */
class TestBehavior class TestBehavior
{ {
static init() { static init() {

View file

@ -1,4 +1,4 @@
import { parseArgs } from "../util/argParser.js"; import { parseArgs } from "../dist/util/argParser.js";
import fs from "fs"; import fs from "fs";

View file

@ -1,4 +1,4 @@
import { calculatePercentageUsed, checkDiskUtilization } from "../util/storage.js"; import { calculatePercentageUsed, checkDiskUtilization } from "../dist/util/storage.js";
test("ensure calculatePercentageUsed returns expected values", () => { test("ensure calculatePercentageUsed returns expected values", () => {

107
tsconfig.json Normal file
View file

@ -0,0 +1,107 @@
{
"compilerOptions": {
/* Visit https://aka.ms/tsconfig to read more about this file */
/* Projects */
// "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
// "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
// "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
// "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
// "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
/* Language and Environment */
"target": "es2022", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
"lib": ["es2022", "dom", "dom.iterable"], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
// "jsx": "preserve", /* Specify what JSX code is generated. */
// "experimentalDecorators": true, /* Enable experimental support for TC39 stage 2 draft decorators. */
// "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
// "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
// "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
// "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
// "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
// "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
// "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
/* Modules */
"module": "NodeNext", /* Specify what module code is generated. */
"rootDir": "./src", /* Specify the root folder within your source files. */
"moduleResolution": "NodeNext", /* Specify how TypeScript looks up a file from a given module specifier. */
//"baseUrl": "./src", /* Specify the base directory to resolve non-relative module names. */
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
// "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */
// "types": [], /* Specify type package names to be included without being referenced in a source file. */
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
// "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
// "resolveJsonModule": true, /* Enable importing .json files. */
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
/* JavaScript Support */
"allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
"checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
// "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
/* Emit */
// "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
// "declarationMap": true, /* Create sourcemaps for d.ts files. */
// "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
// "sourceMap": true, /* Create source map files for emitted JavaScript files. */
// "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
"outDir": "./dist/", /* Specify an output folder for all emitted files. */
// "removeComments": true, /* Disable emitting comments. */
// "noEmit": true, /* Disable emitting files from a compilation. */
// "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
// "importsNotUsedAsValues": "remove", /* Specify emit/checking behavior for imports that are only used for types. */
// "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
// "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
// "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
// "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
// "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
// "newLine": "crlf", /* Set the newline character for emitting files. */
// "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
// "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
// "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
// "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
// "declarationDir": "./", /* Specify the output directory for generated declaration files. */
// "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */
/* Interop Constraints */
// "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
// "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
//"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
// "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
"forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
/* Type Checking */
"strict": true, /* Enable all strict type-checking options. */
// "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
// "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
// "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
// "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
// "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
// "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
// "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
// "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
// "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
// "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
// "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
// "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
// "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
// "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
// "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
// "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */
// "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
// "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
/* Completeness */
// "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
"skipLibCheck": true /* Skip type checking all .d.ts files. */
},
"include": [
"src/**/*",
]
}

893
yarn.lock

File diff suppressed because it is too large Load diff