mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
TypeScript Conversion (#425)
Follows #424. Converts the upcoming 1.0.0 branch, which is based on native browser-based traffic capture and recording, to TypeScript. Fixes #426. Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>. Co-authored-by: emma <hi@emma.cafe>
This commit is contained in:
parent
877d9f5b44
commit
af1e0860e4
36 changed files with 2446 additions and 1406 deletions
|
@ -1,39 +1,32 @@
|
||||||
module.exports = {
|
module.exports = {
|
||||||
"env": {
|
env: {
|
||||||
"browser": true,
|
browser: true,
|
||||||
"es2021": true,
|
es2021: true,
|
||||||
"node": true,
|
node: true,
|
||||||
"jest": true
|
jest: true,
|
||||||
},
|
},
|
||||||
"extends": "eslint:recommended",
|
extends: ["eslint:recommended", "plugin:@typescript-eslint/recommended"],
|
||||||
"parserOptions": {
|
parser: "@typescript-eslint/parser",
|
||||||
"ecmaVersion": 12,
|
plugins: ["@typescript-eslint"],
|
||||||
"sourceType": "module"
|
parserOptions: {
|
||||||
},
|
ecmaVersion: 12,
|
||||||
"rules": {
|
sourceType: "module",
|
||||||
"indent": [
|
},
|
||||||
"error",
|
rules: {
|
||||||
2
|
indent: ["error", 2],
|
||||||
],
|
"linebreak-style": ["error", "unix"],
|
||||||
"linebreak-style": [
|
quotes: ["error", "double"],
|
||||||
"error",
|
semi: ["error", "always"],
|
||||||
"unix"
|
"no-constant-condition": ["error", { checkLoops: false }],
|
||||||
],
|
"no-use-before-define": [
|
||||||
"quotes": [
|
"error",
|
||||||
"error",
|
{
|
||||||
"double"
|
variables: true,
|
||||||
],
|
functions: false,
|
||||||
"semi": [
|
classes: false,
|
||||||
"error",
|
allowNamedExports: true,
|
||||||
"always"
|
},
|
||||||
],
|
],
|
||||||
"no-constant-condition": [
|
},
|
||||||
"error",
|
reportUnusedDisableDirectives: true,
|
||||||
{"checkLoops": false }
|
|
||||||
],
|
|
||||||
"no-use-before-define": [
|
|
||||||
"error",
|
|
||||||
{"variables": true, "functions": false, "classes": false, "allowNamedExports": true}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
2
.github/workflows/ci.yaml
vendored
2
.github/workflows/ci.yaml
vendored
|
@ -40,6 +40,8 @@ jobs:
|
||||||
node-version: ${{ matrix.node-version }}
|
node-version: ${{ matrix.node-version }}
|
||||||
- name: install requirements
|
- name: install requirements
|
||||||
run: yarn install
|
run: yarn install
|
||||||
|
- name: build js
|
||||||
|
run: yarn run tsc
|
||||||
- name: build docker
|
- name: build docker
|
||||||
run: docker-compose build
|
run: docker-compose build
|
||||||
- name: run jest
|
- name: run jest
|
||||||
|
|
10
Dockerfile
10
Dockerfile
|
@ -38,14 +38,18 @@ RUN mkdir -p /tmp/ads && cd /tmp/ads && \
|
||||||
|
|
||||||
RUN yarn install --network-timeout 1000000
|
RUN yarn install --network-timeout 1000000
|
||||||
|
|
||||||
ADD *.js /app/
|
ADD tsconfig.json /app/
|
||||||
ADD util/*.js /app/util/
|
ADD src /app/src
|
||||||
|
|
||||||
|
RUN yarn run tsc
|
||||||
|
|
||||||
ADD config/ /app/
|
ADD config/ /app/
|
||||||
|
|
||||||
ADD html/ /app/html/
|
ADD html/ /app/html/
|
||||||
|
|
||||||
RUN ln -s /app/main.js /usr/bin/crawl; ln -s /app/create-login-profile.js /usr/bin/create-login-profile
|
RUN chmod u+x /app/dist/main.js /app/dist/create-login-profile.js
|
||||||
|
|
||||||
|
RUN ln -s /app/dist/main.js /usr/bin/crawl; ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile
|
||||||
|
|
||||||
WORKDIR /crawls
|
WORKDIR /crawls
|
||||||
|
|
||||||
|
|
|
@ -1,4 +0,0 @@
|
||||||
|
|
||||||
export default async ({data, page, crawler}) => {
|
|
||||||
await crawler.loadPage(page, data);
|
|
||||||
};
|
|
23
package.json
23
package.json
|
@ -7,7 +7,8 @@
|
||||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||||
"license": "AGPL-3.0-or-later",
|
"license": "AGPL-3.0-or-later",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"lint": "eslint *.js util/*.js tests/*.test.js",
|
"tsc": "tsc",
|
||||||
|
"lint": "eslint *.js tests/*.test.js",
|
||||||
"test": "yarn node --experimental-vm-modules $(yarn bin jest --bail 1)",
|
"test": "yarn node --experimental-vm-modules $(yarn bin jest --bail 1)",
|
||||||
"prepare": "husky install"
|
"prepare": "husky install"
|
||||||
},
|
},
|
||||||
|
@ -18,23 +19,31 @@
|
||||||
"crc": "^4.3.2",
|
"crc": "^4.3.2",
|
||||||
"get-folder-size": "^4.0.0",
|
"get-folder-size": "^4.0.0",
|
||||||
"husky": "^8.0.3",
|
"husky": "^8.0.3",
|
||||||
"ioredis": "^4.27.1",
|
"ioredis": "^5.3.2",
|
||||||
"js-yaml": "^4.1.0",
|
"js-yaml": "^4.1.0",
|
||||||
"minio": "7.0.26",
|
"minio": "^7.1.3",
|
||||||
"p-queue": "^7.3.4",
|
"p-queue": "^7.3.4",
|
||||||
"puppeteer-core": "^20.7.4",
|
"puppeteer-core": "^20.7.4",
|
||||||
"sharp": "^0.32.1",
|
"sharp": "^0.32.1",
|
||||||
"sitemapper": "^3.2.5",
|
"sitemapper": "^3.2.6",
|
||||||
|
"tsc": "^2.0.4",
|
||||||
"uuid": "8.3.2",
|
"uuid": "8.3.2",
|
||||||
"warcio": "^2.2.0",
|
"warcio": "^2.2.1",
|
||||||
"ws": "^7.4.4",
|
"ws": "^7.4.4",
|
||||||
"yargs": "^17.7.2"
|
"yargs": "^17.7.2"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"eslint": "^8.37.0",
|
"@types/js-yaml": "^4.0.8",
|
||||||
|
"@types/node": "^20.8.7",
|
||||||
|
"@types/uuid": "^9.0.6",
|
||||||
|
"@types/ws": "^8.5.8",
|
||||||
|
"@typescript-eslint/eslint-plugin": "^6.10.0",
|
||||||
|
"@typescript-eslint/parser": "^6.10.0",
|
||||||
|
"eslint": "^8.53.0",
|
||||||
"eslint-plugin-react": "^7.22.0",
|
"eslint-plugin-react": "^7.22.0",
|
||||||
"jest": "^29.2.1",
|
"jest": "^29.2.1",
|
||||||
"md5": "^2.3.0"
|
"md5": "^2.3.0",
|
||||||
|
"typescript": "^5.2.2"
|
||||||
},
|
},
|
||||||
"jest": {
|
"jest": {
|
||||||
"transform": {},
|
"transform": {},
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -2,24 +2,25 @@
|
||||||
|
|
||||||
import fs from "fs";
|
import fs from "fs";
|
||||||
import path from "path";
|
import path from "path";
|
||||||
import http from "http";
|
import http, { IncomingMessage, ServerResponse } from "http";
|
||||||
|
|
||||||
import readline from "readline";
|
import readline from "readline";
|
||||||
import child_process from "child_process";
|
import child_process from "child_process";
|
||||||
|
|
||||||
import yargs from "yargs";
|
import yargs, { Options } from "yargs";
|
||||||
|
|
||||||
import { logger } from "./util/logger.js";
|
import { logger } from "./util/logger.js";
|
||||||
|
|
||||||
import { Browser } from "./util/browser.js";
|
import { Browser } from "./util/browser.js";
|
||||||
import { initStorage } from "./util/storage.js";
|
import { initStorage } from "./util/storage.js";
|
||||||
|
import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
|
||||||
|
|
||||||
const profileHTML = fs.readFileSync(new URL("html/createProfile.html", import.meta.url), {encoding: "utf8"});
|
const profileHTML = fs.readFileSync(new URL("../html/createProfile.html", import.meta.url), {encoding: "utf8"});
|
||||||
const vncHTML = fs.readFileSync(new URL("html/vnc_lite.html", import.meta.url), {encoding: "utf8"});
|
const vncHTML = fs.readFileSync(new URL("../html/vnc_lite.html", import.meta.url), {encoding: "utf8"});
|
||||||
|
|
||||||
const behaviors = fs.readFileSync(new URL("./node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});
|
const behaviors = fs.readFileSync(new URL("../node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});
|
||||||
|
|
||||||
function cliOpts() {
|
function cliOpts(): { [key: string]: Options } {
|
||||||
return {
|
return {
|
||||||
"url": {
|
"url": {
|
||||||
describe: "The URL of the login page",
|
describe: "The URL of the login page",
|
||||||
|
@ -93,7 +94,7 @@ function cliOpts() {
|
||||||
}
|
}
|
||||||
|
|
||||||
function getDefaultWindowSize() {
|
function getDefaultWindowSize() {
|
||||||
const values = process.env.GEOMETRY.split("x");
|
const values = (process.env.GEOMETRY || "").split("x");
|
||||||
const x = Number(values[0]);
|
const x = Number(values[0]);
|
||||||
const y = Number(values[1]);
|
const y = Number(values[1]);
|
||||||
return `${x},${y}`;
|
return `${x},${y}`;
|
||||||
|
@ -102,23 +103,23 @@ function getDefaultWindowSize() {
|
||||||
|
|
||||||
|
|
||||||
async function main() {
|
async function main() {
|
||||||
const params = yargs(process.argv)
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
const params : any = yargs(process.argv)
|
||||||
.usage("browsertrix-crawler profile [options]")
|
.usage("browsertrix-crawler profile [options]")
|
||||||
.option(cliOpts())
|
.option(cliOpts())
|
||||||
.argv;
|
.argv;
|
||||||
|
|
||||||
logger.setDebugLogging(true);
|
logger.setDebugLogging(true);
|
||||||
|
|
||||||
|
|
||||||
if (!params.headless) {
|
if (!params.headless) {
|
||||||
logger.debug("Launching XVFB");
|
logger.debug("Launching XVFB");
|
||||||
child_process.spawn("Xvfb", [
|
child_process.spawn("Xvfb", [
|
||||||
process.env.DISPLAY,
|
process.env.DISPLAY || "",
|
||||||
"-listen",
|
"-listen",
|
||||||
"tcp",
|
"tcp",
|
||||||
"-screen",
|
"-screen",
|
||||||
"0",
|
"0",
|
||||||
process.env.GEOMETRY,
|
process.env.GEOMETRY || "",
|
||||||
"-ac",
|
"-ac",
|
||||||
"+extension",
|
"+extension",
|
||||||
"RANDR"
|
"RANDR"
|
||||||
|
@ -137,9 +138,9 @@ async function main() {
|
||||||
"-rfbport",
|
"-rfbport",
|
||||||
"6080",
|
"6080",
|
||||||
"-passwd",
|
"-passwd",
|
||||||
process.env.VNC_PASS,
|
process.env.VNC_PASS || "",
|
||||||
"-display",
|
"-display",
|
||||||
process.env.DISPLAY
|
process.env.DISPLAY || ""
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -178,7 +179,7 @@ async function main() {
|
||||||
|
|
||||||
const { page, cdp } = await browser.newWindowPageWithCDP();
|
const { page, cdp } = await browser.newWindowPageWithCDP();
|
||||||
|
|
||||||
const waitUntil = "load";
|
const waitUntil : PuppeteerLifeCycleEvent = "load";
|
||||||
|
|
||||||
await page.setCacheEnabled(false);
|
await page.setCacheEnabled(false);
|
||||||
|
|
||||||
|
@ -203,7 +204,9 @@ async function main() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function automatedProfile(params, browser, page, cdp, waitUntil) {
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
async function automatedProfile(params: any, browser: Browser, page: Page, cdp: CDPSession,
|
||||||
|
waitUntil: PuppeteerLifeCycleEvent) {
|
||||||
let u, p;
|
let u, p;
|
||||||
|
|
||||||
logger.debug("Looking for username and password entry fields on page...");
|
logger.debug("Looking for username and password entry fields on page...");
|
||||||
|
@ -222,12 +225,12 @@ async function automatedProfile(params, browser, page, cdp, waitUntil) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
await u.type(params.user);
|
await u!.type(params.user);
|
||||||
|
|
||||||
await p.type(params.password);
|
await p!.type(params.password);
|
||||||
|
|
||||||
await Promise.allSettled([
|
await Promise.allSettled([
|
||||||
p.press("Enter"),
|
p!.press("Enter"),
|
||||||
page.waitForNavigation({waitUntil})
|
page.waitForNavigation({waitUntil})
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
@ -240,7 +243,8 @@ async function automatedProfile(params, browser, page, cdp, waitUntil) {
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
async function createProfile(params, browser, page, cdp, targetFilename = "") {
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
async function createProfile(params: any, browser: Browser, page: Page, cdp: CDPSession, targetFilename = "") {
|
||||||
await cdp.send("Network.clearBrowserCache");
|
await cdp.send("Network.clearBrowserCache");
|
||||||
|
|
||||||
await browser.close();
|
await browser.close();
|
||||||
|
@ -268,8 +272,9 @@ async function createProfile(params, browser, page, cdp, targetFilename = "") {
|
||||||
return resource;
|
return resource;
|
||||||
}
|
}
|
||||||
|
|
||||||
function promptInput(msg, hidden = false) {
|
function promptInput(msg: string, hidden = false) {
|
||||||
const rl = readline.createInterface({
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
const rl : any = readline.createInterface({
|
||||||
input: process.stdin,
|
input: process.stdin,
|
||||||
output: process.stdout
|
output: process.stdout
|
||||||
});
|
});
|
||||||
|
@ -290,8 +295,8 @@ function promptInput(msg, hidden = false) {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return new Promise((resolve) => {
|
return new Promise<string>((resolve) => {
|
||||||
rl.question(msg, function (res) {
|
rl.question(msg, function (res: string) {
|
||||||
rl.close();
|
rl.close();
|
||||||
resolve(res);
|
resolve(res);
|
||||||
});
|
});
|
||||||
|
@ -300,9 +305,31 @@ function promptInput(msg, hidden = false) {
|
||||||
|
|
||||||
|
|
||||||
class InteractiveBrowser {
|
class InteractiveBrowser {
|
||||||
constructor(params, browser, page, cdp, targetId) {
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
params: any;
|
||||||
|
browser: Browser;
|
||||||
|
page: Page;
|
||||||
|
cdp: CDPSession;
|
||||||
|
|
||||||
|
targetId: string;
|
||||||
|
originSet = new Set<string>();
|
||||||
|
|
||||||
|
shutdownWait: number;
|
||||||
|
shutdownTimer: NodeJS.Timer | null;
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
params: any,
|
||||||
|
browser: Browser,
|
||||||
|
page: Page,
|
||||||
|
cdp: CDPSession,
|
||||||
|
targetId: string
|
||||||
|
) {
|
||||||
logger.info("Creating Profile Interactively...");
|
logger.info("Creating Profile Interactively...");
|
||||||
child_process.spawn("socat", ["tcp-listen:9222,reuseaddr,fork", "tcp:localhost:9221"]);
|
child_process.spawn("socat", [
|
||||||
|
"tcp-listen:9222,reuseaddr,fork",
|
||||||
|
"tcp:localhost:9221",
|
||||||
|
]);
|
||||||
|
|
||||||
this.params = params;
|
this.params = params;
|
||||||
this.browser = browser;
|
this.browser = browser;
|
||||||
|
@ -311,8 +338,6 @@ class InteractiveBrowser {
|
||||||
|
|
||||||
this.targetId = targetId;
|
this.targetId = targetId;
|
||||||
|
|
||||||
this.originSet = new Set();
|
|
||||||
|
|
||||||
this.addOrigin();
|
this.addOrigin();
|
||||||
|
|
||||||
page.on("load", () => this.handlePageLoad());
|
page.on("load", () => this.handlePageLoad());
|
||||||
|
@ -323,25 +348,31 @@ class InteractiveBrowser {
|
||||||
|
|
||||||
cdp.on("Page.windowOpen", async (resp) => {
|
cdp.on("Page.windowOpen", async (resp) => {
|
||||||
if (resp.url) {
|
if (resp.url) {
|
||||||
await cdp.send("Target.activateTarget", {targetId: this.targetId});
|
await cdp.send("Target.activateTarget", { targetId: this.targetId });
|
||||||
await page.goto(resp.url);
|
await page.goto(resp.url);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
this.shutdownWait = params.shutdownWait * 1000;
|
this.shutdownWait = params.shutdownWait * 1000;
|
||||||
|
|
||||||
if (this.shutdownWait) {
|
if (this.shutdownWait) {
|
||||||
this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
|
this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
|
||||||
logger.debug(`Shutting down in ${this.shutdownWait}ms if no ping received`);
|
logger.debug(
|
||||||
|
`Shutting down in ${this.shutdownWait}ms if no ping received`
|
||||||
|
);
|
||||||
} else {
|
} else {
|
||||||
this.shutdownTimer = 0;
|
this.shutdownTimer = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
const httpServer = http.createServer((req, res) => this.handleRequest(req, res));
|
const httpServer = http.createServer((req, res) =>
|
||||||
|
this.handleRequest(req, res)
|
||||||
|
);
|
||||||
const port = 9223;
|
const port = 9223;
|
||||||
httpServer.listen(port);
|
httpServer.listen(port);
|
||||||
logger.info(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`);
|
logger.info(
|
||||||
|
`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`
|
||||||
|
);
|
||||||
|
|
||||||
if (!params.headless) {
|
if (!params.headless) {
|
||||||
logger.info("Screencasting with VNC on port 6080");
|
logger.info("Screencasting with VNC on port 6080");
|
||||||
|
@ -363,18 +394,26 @@ class InteractiveBrowser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async saveCookiesFor(url) {
|
async saveCookiesFor(url: string) {
|
||||||
try {
|
try {
|
||||||
if (this.params.cookieDays <= 0) {
|
if (this.params.cookieDays <= 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const cookies = await this.browser.getCookies(this.page, url);
|
const cookies = await this.browser.getCookies(this.page);
|
||||||
for (const cookie of cookies) {
|
for (const cookieOrig of cookies) {
|
||||||
cookie.expires = (new Date().getTime() / 1000) + this.params.cookieDays * 86400;
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
const cookie = cookieOrig as any;
|
||||||
|
cookie.expires =
|
||||||
|
new Date().getTime() / 1000 + this.params.cookieDays * 86400;
|
||||||
|
|
||||||
delete cookie.size;
|
delete cookie.size;
|
||||||
delete cookie.session;
|
delete cookie.session;
|
||||||
if (cookie.sameSite && cookie.sameSite !== "Lax" && cookie.sameSite !== "Strict") {
|
if (
|
||||||
|
cookie.sameSite &&
|
||||||
|
cookie.sameSite !== "Lax" &&
|
||||||
|
cookie.sameSite !== "Strict"
|
||||||
|
) {
|
||||||
delete cookie.sameSite;
|
delete cookie.sameSite;
|
||||||
}
|
}
|
||||||
if (!cookie.domain && !cookie.path) {
|
if (!cookie.domain && !cookie.path) {
|
||||||
|
@ -382,64 +421,76 @@ class InteractiveBrowser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
await this.browser.setCookies(this.page, cookies);
|
await this.browser.setCookies(this.page, cookies);
|
||||||
} catch (e) {
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
} catch (e: any) {
|
||||||
logger.error("Save Cookie Error: ", e);
|
logger.error("Save Cookie Error: ", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
addOrigin() {
|
addOrigin() {
|
||||||
const url = this.page.url();
|
const url = this.page.url();
|
||||||
logger.debug("Adding origin", {url});
|
logger.debug("Adding origin", { url });
|
||||||
if (url.startsWith("http:") || url.startsWith("https:")) {
|
if (url.startsWith("http:") || url.startsWith("https:")) {
|
||||||
this.originSet.add(new URL(url).origin);
|
this.originSet.add(new URL(url).origin);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async handleRequest(req, res) {
|
async handleRequest(req: IncomingMessage, res: ServerResponse) {
|
||||||
const parsedUrl = new URL(req.url, `http://${req.headers.host}`);
|
const parsedUrl = new URL(req.url || "", `http://${req.headers.host}`);
|
||||||
const pathname = parsedUrl.pathname;
|
const pathname = parsedUrl.pathname;
|
||||||
let targetUrl;
|
let targetUrl;
|
||||||
let origins;
|
let origins;
|
||||||
|
|
||||||
switch (pathname) {
|
switch (pathname) {
|
||||||
case "/":
|
case "/":
|
||||||
res.writeHead(200, {"Content-Type": "text/html"});
|
res.writeHead(200, { "Content-Type": "text/html" });
|
||||||
if (this.params.headless) {
|
if (this.params.headless) {
|
||||||
targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${this.targetId}&panel=resources`;
|
targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${this.targetId}&panel=resources`;
|
||||||
} else {
|
} else {
|
||||||
targetUrl = `http://$HOST:9223/vnc/?host=$HOST&port=6080&password=${process.env.VNC_PASS}`;
|
targetUrl = `http://$HOST:9223/vnc/?host=$HOST&port=6080&password=${process.env.VNC_PASS}`;
|
||||||
}
|
}
|
||||||
res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replaceAll("$HOST", parsedUrl.hostname)));
|
res.end(
|
||||||
|
profileHTML.replace(
|
||||||
|
"$DEVTOOLS_SRC",
|
||||||
|
targetUrl.replaceAll("$HOST", parsedUrl.hostname)
|
||||||
|
)
|
||||||
|
);
|
||||||
return;
|
return;
|
||||||
|
|
||||||
case "/vnc/":
|
case "/vnc/":
|
||||||
case "/vnc/index.html":
|
case "/vnc/index.html":
|
||||||
res.writeHead(200, {"Content-Type": "text/html"});
|
res.writeHead(200, { "Content-Type": "text/html" });
|
||||||
res.end(vncHTML);
|
res.end(vncHTML);
|
||||||
return;
|
return;
|
||||||
|
|
||||||
case "/ping":
|
case "/ping":
|
||||||
if (this.shutdownWait) {
|
if (this.shutdownWait) {
|
||||||
clearInterval(this.shutdownTimer);
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
|
clearTimeout(this.shutdownTimer as any);
|
||||||
logger.debug(`Ping received, delaying shutdown for ${this.shutdownWait}ms`);
|
this.shutdownTimer = setTimeout(
|
||||||
|
() => process.exit(0),
|
||||||
|
this.shutdownWait
|
||||||
|
);
|
||||||
|
logger.debug(
|
||||||
|
`Ping received, delaying shutdown for ${this.shutdownWait}ms`
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
origins = Array.from(this.originSet.values());
|
origins = Array.from(this.originSet.values());
|
||||||
|
|
||||||
res.writeHead(200, {"Content-Type": "application/json"});
|
res.writeHead(200, { "Content-Type": "application/json" });
|
||||||
|
|
||||||
res.end(JSON.stringify({pong: true, origins}));
|
res.end(JSON.stringify({ pong: true, origins }));
|
||||||
return;
|
return;
|
||||||
|
|
||||||
case "/target":
|
case "/target":
|
||||||
res.writeHead(200, {"Content-Type": "application/json"});
|
res.writeHead(200, { "Content-Type": "application/json" });
|
||||||
res.end(JSON.stringify({targetId: this.targetId}));
|
res.end(JSON.stringify({ targetId: this.targetId }));
|
||||||
return;
|
return;
|
||||||
|
|
||||||
case "/vncpass":
|
case "/vncpass":
|
||||||
res.writeHead(200, {"Content-Type": "application/json"});
|
res.writeHead(200, { "Content-Type": "application/json" });
|
||||||
res.end(JSON.stringify({password: process.env.VNC_PASS}));
|
res.end(JSON.stringify({ password: process.env.VNC_PASS }));
|
||||||
return;
|
return;
|
||||||
|
|
||||||
case "/navigate":
|
case "/navigate":
|
||||||
|
@ -451,14 +502,14 @@ class InteractiveBrowser {
|
||||||
const postData = await this.readBodyJson(req);
|
const postData = await this.readBodyJson(req);
|
||||||
const url = new URL(postData.url).href;
|
const url = new URL(postData.url).href;
|
||||||
|
|
||||||
res.writeHead(200, {"Content-Type": "application/json"});
|
res.writeHead(200, { "Content-Type": "application/json" });
|
||||||
res.end(JSON.stringify({success: true}));
|
res.end(JSON.stringify({ success: true }));
|
||||||
|
|
||||||
this.page.goto(url);
|
this.page.goto(url);
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
} catch (e) {
|
} catch (e: any) {
|
||||||
res.writeHead(400, {"Content-Type": "application/json"});
|
res.writeHead(400, { "Content-Type": "application/json" });
|
||||||
res.end(JSON.stringify({"error": e.toString()}));
|
res.end(JSON.stringify({ error: e.toString() }));
|
||||||
logger.warn("HTTP Error", e);
|
logger.warn("HTTP Error", e);
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
|
@ -474,14 +525,21 @@ class InteractiveBrowser {
|
||||||
|
|
||||||
await this.saveAllCookies();
|
await this.saveAllCookies();
|
||||||
|
|
||||||
const resource = await createProfile(this.params, this.browser, this.page, this.cdp, targetFilename);
|
const resource = await createProfile(
|
||||||
|
this.params,
|
||||||
|
this.browser,
|
||||||
|
this.page,
|
||||||
|
this.cdp,
|
||||||
|
targetFilename
|
||||||
|
);
|
||||||
origins = Array.from(this.originSet.values());
|
origins = Array.from(this.originSet.values());
|
||||||
|
|
||||||
res.writeHead(200, {"Content-Type": "application/json"});
|
res.writeHead(200, { "Content-Type": "application/json" });
|
||||||
res.end(JSON.stringify({resource, origins}));
|
res.end(JSON.stringify({ resource, origins }));
|
||||||
} catch (e) {
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
res.writeHead(500, {"Content-Type": "application/json"});
|
} catch (e: any) {
|
||||||
res.end(JSON.stringify({"error": e.toString()}));
|
res.writeHead(500, { "Content-Type": "application/json" });
|
||||||
|
res.end(JSON.stringify({ error: e.toString() }));
|
||||||
logger.warn("HTTP Error", e);
|
logger.warn("HTTP Error", e);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -498,11 +556,16 @@ class InteractiveBrowser {
|
||||||
|
|
||||||
await createProfile(this.params, this.browser, this.page, this.cdp);
|
await createProfile(this.params, this.browser, this.page, this.cdp);
|
||||||
|
|
||||||
res.writeHead(200, {"Content-Type": "text/html"});
|
res.writeHead(200, { "Content-Type": "text/html" });
|
||||||
res.end("<html><body>Profile Created! You may now close this window.</body></html>");
|
res.end(
|
||||||
} catch (e) {
|
"<html><body>Profile Created! You may now close this window.</body></html>"
|
||||||
res.writeHead(500, {"Content-Type": "text/html"});
|
);
|
||||||
res.end("<html><body>Profile creation failed! See the browsertrix-crawler console for more info");
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
} catch (e: any) {
|
||||||
|
res.writeHead(500, { "Content-Type": "text/html" });
|
||||||
|
res.end(
|
||||||
|
"<html><body>Profile creation failed! See the browsertrix-crawler console for more info"
|
||||||
|
);
|
||||||
logger.warn("HTTP Error", e);
|
logger.warn("HTTP Error", e);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -511,18 +574,21 @@ class InteractiveBrowser {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pathname.startsWith("/vnc/")) {
|
if (pathname.startsWith("/vnc/")) {
|
||||||
const fileUrl = new URL("node_modules/@novnc/novnc/" + pathname.slice("/vnc/".length), import.meta.url);
|
const fileUrl = new URL(
|
||||||
const file = fs.readFileSync(fileUrl, {encoding: "utf-8"});
|
"../node_modules/@novnc/novnc/" + pathname.slice("/vnc/".length),
|
||||||
res.writeHead(200, {"Content-Type": "application/javascript"});
|
import.meta.url
|
||||||
|
);
|
||||||
|
const file = fs.readFileSync(fileUrl, { encoding: "utf-8" });
|
||||||
|
res.writeHead(200, { "Content-Type": "application/javascript" });
|
||||||
res.end(file);
|
res.end(file);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
res.writeHead(404, {"Content-Type": "text/html"});
|
res.writeHead(404, { "Content-Type": "text/html" });
|
||||||
res.end("Not Found");
|
res.end("Not Found");
|
||||||
}
|
}
|
||||||
|
|
||||||
async readBodyJson(req) {
|
async readBodyJson(req: IncomingMessage) {
|
||||||
const buffers = [];
|
const buffers = [];
|
||||||
|
|
||||||
for await (const chunk of req) {
|
for await (const chunk of req) {
|
7
src/defaultDriver.ts
Normal file
7
src/defaultDriver.ts
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
import { Page } from "puppeteer-core";
|
||||||
|
import { PageState } from "./util/state.js";
|
||||||
|
import { Crawler } from "./crawler.js";
|
||||||
|
|
||||||
|
export default async ({data, page, crawler} : {data: PageState, page: Page, crawler: Crawler}) => {
|
||||||
|
await crawler.loadPage(page, data);
|
||||||
|
};
|
|
@ -5,13 +5,13 @@ import { setExitOnRedisError } from "./util/redis.js";
|
||||||
import { Crawler } from "./crawler.js";
|
import { Crawler } from "./crawler.js";
|
||||||
|
|
||||||
|
|
||||||
var crawler = null;
|
let crawler : Crawler | null = null;
|
||||||
|
|
||||||
var lastSigInt = 0;
|
let lastSigInt = 0;
|
||||||
let forceTerm = false;
|
let forceTerm = false;
|
||||||
|
|
||||||
|
|
||||||
async function handleTerminate(signame) {
|
async function handleTerminate(signame: string) {
|
||||||
logger.info(`${signame} received...`);
|
logger.info(`${signame} received...`);
|
||||||
if (!crawler || !crawler.crawlState) {
|
if (!crawler || !crawler.crawlState) {
|
||||||
logger.error("error: no crawler running, exiting");
|
logger.error("error: no crawler running, exiting");
|
||||||
|
@ -23,7 +23,7 @@ async function handleTerminate(signame) {
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
setExitOnRedisError(true);
|
setExitOnRedisError();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
crawler.checkCanceled();
|
crawler.checkCanceled();
|
||||||
|
@ -31,13 +31,13 @@ async function handleTerminate(signame) {
|
||||||
if (!crawler.interrupted) {
|
if (!crawler.interrupted) {
|
||||||
logger.info("SIGNAL: gracefully finishing current pages...");
|
logger.info("SIGNAL: gracefully finishing current pages...");
|
||||||
crawler.gracefulFinishOnInterrupt();
|
crawler.gracefulFinishOnInterrupt();
|
||||||
|
} else if (forceTerm || Date.now() - lastSigInt > 200) {
|
||||||
} else if (forceTerm || (Date.now() - lastSigInt) > 200) {
|
|
||||||
logger.info("SIGNAL: stopping crawl now...");
|
logger.info("SIGNAL: stopping crawl now...");
|
||||||
await crawler.serializeAndExit();
|
await crawler.serializeAndExit();
|
||||||
}
|
}
|
||||||
lastSigInt = Date.now();
|
lastSigInt = Date.now();
|
||||||
} catch (e) {
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
} catch (e: any) {
|
||||||
logger.error("Error stopping crawl after receiving termination signal", e);
|
logger.error("Error stopping crawl after receiving termination signal", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -4,7 +4,7 @@ import os from "os";
|
||||||
|
|
||||||
import yaml from "js-yaml";
|
import yaml from "js-yaml";
|
||||||
import { KnownDevices as devices } from "puppeteer-core";
|
import { KnownDevices as devices } from "puppeteer-core";
|
||||||
import yargs from "yargs";
|
import yargs, { Options } from "yargs";
|
||||||
import { hideBin } from "yargs/helpers";
|
import { hideBin } from "yargs/helpers";
|
||||||
|
|
||||||
import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES } from "./constants.js";
|
import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES } from "./constants.js";
|
||||||
|
@ -16,8 +16,8 @@ import { logger } from "./logger.js";
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
class ArgParser {
|
class ArgParser {
|
||||||
get cliOpts() {
|
get cliOpts() : { [key: string]: Options } {
|
||||||
const coerce = array => {
|
const coerce = (array : string[]) => {
|
||||||
return array.flatMap(v => v.split(",")).filter(x => !!x);
|
return array.flatMap(v => v.split(",")).filter(x => !!x);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -305,7 +305,7 @@ class ArgParser {
|
||||||
"warcInfo": {
|
"warcInfo": {
|
||||||
alias: ["warcinfo"],
|
alias: ["warcinfo"],
|
||||||
describe: "Optional fields added to the warcinfo record in combined WARCs",
|
describe: "Optional fields added to the warcinfo record in combined WARCs",
|
||||||
type: "object"
|
//type: "object"
|
||||||
},
|
},
|
||||||
|
|
||||||
"redisStoreUrl": {
|
"redisStoreUrl": {
|
||||||
|
@ -423,7 +423,7 @@ class ArgParser {
|
||||||
|
|
||||||
"customBehaviors": {
|
"customBehaviors": {
|
||||||
describe: "injects a custom behavior file or set of behavior files in a directory",
|
describe: "injects a custom behavior file or set of behavior files in a directory",
|
||||||
type: ["string"]
|
type: "string"
|
||||||
},
|
},
|
||||||
|
|
||||||
"debugAccessRedis": {
|
"debugAccessRedis": {
|
||||||
|
@ -433,8 +433,8 @@ class ArgParser {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
parseArgs(argv) {
|
parseArgs(argvParams?: string[]) {
|
||||||
argv = argv || process.argv;
|
let argv = argvParams || process.argv;
|
||||||
|
|
||||||
if (process.env.CRAWL_ARGS) {
|
if (process.env.CRAWL_ARGS) {
|
||||||
argv = argv.concat(this.splitCrawlArgsQuoteSafe(process.env.CRAWL_ARGS));
|
argv = argv.concat(this.splitCrawlArgsQuoteSafe(process.env.CRAWL_ARGS));
|
||||||
|
@ -445,11 +445,12 @@ class ArgParser {
|
||||||
const parsed = yargs(hideBin(argv))
|
const parsed = yargs(hideBin(argv))
|
||||||
.usage("crawler [options]")
|
.usage("crawler [options]")
|
||||||
.option(this.cliOpts)
|
.option(this.cliOpts)
|
||||||
.config("config", "Path to YAML config file", (configPath) => {
|
.config("config", "Path to YAML config file", (configPath : string | number) => {
|
||||||
if (configPath === "/crawls/stdin") {
|
if (configPath === "/crawls/stdin") {
|
||||||
configPath = process.stdin.fd;
|
configPath = process.stdin.fd;
|
||||||
}
|
}
|
||||||
origConfig = yaml.load(fs.readFileSync(configPath, "utf8"));
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
origConfig = yaml.load(fs.readFileSync(configPath, "utf8")) as any;
|
||||||
return origConfig;
|
return origConfig;
|
||||||
})
|
})
|
||||||
.check((argv) => this.validateArgs(argv))
|
.check((argv) => this.validateArgs(argv))
|
||||||
|
@ -458,13 +459,15 @@ class ArgParser {
|
||||||
return {parsed, origConfig};
|
return {parsed, origConfig};
|
||||||
}
|
}
|
||||||
|
|
||||||
splitCrawlArgsQuoteSafe(crawlArgs) {
|
splitCrawlArgsQuoteSafe(crawlArgs: string) : string[] {
|
||||||
// Split process.env.CRAWL_ARGS on spaces but retaining spaces within double quotes
|
// Split process.env.CRAWL_ARGS on spaces but retaining spaces within double quotes
|
||||||
const regex = /"[^"]+"|[^\s]+/g;
|
const regex = /"[^"]+"|[^\s]+/g;
|
||||||
return crawlArgs.match(regex).map(e => e.replace(/"(.+)"/, "$1"));
|
const res = crawlArgs.match(regex);
|
||||||
|
return res ? res.map(e => e.replace(/"(.+)"/, "$1")) : [];
|
||||||
}
|
}
|
||||||
|
|
||||||
validateArgs(argv) {
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
validateArgs(argv: Record<string, any>) {
|
||||||
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname;
|
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname;
|
||||||
argv.collection = interpolateFilename(argv.collection, argv.crawlId);
|
argv.collection = interpolateFilename(argv.collection, argv.crawlId);
|
||||||
|
|
||||||
|
@ -474,15 +477,16 @@ class ArgParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
// background behaviors to apply
|
// background behaviors to apply
|
||||||
const behaviorOpts = {};
|
const behaviorOpts : {[key: string]: string | boolean} = {};
|
||||||
argv.behaviors.forEach((x) => behaviorOpts[x] = true);
|
argv.behaviors.forEach((x: string) => behaviorOpts[x] = true);
|
||||||
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
|
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
|
||||||
argv.behaviorOpts = JSON.stringify(behaviorOpts);
|
argv.behaviorOpts = JSON.stringify(behaviorOpts);
|
||||||
|
|
||||||
argv.text = argv.text || [];
|
argv.text = argv.text || [];
|
||||||
|
|
||||||
if (argv.mobileDevice) {
|
if (argv.mobileDevice) {
|
||||||
argv.emulateDevice = devices[argv.mobileDevice.replace("-", " ")];
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
argv.emulateDevice = (devices as Record<string, any>)[argv.mobileDevice.replace("-", " ")];
|
||||||
if (!argv.emulateDevice) {
|
if (!argv.emulateDevice) {
|
||||||
logger.fatal("Unknown device: " + argv.mobileDevice);
|
logger.fatal("Unknown device: " + argv.mobileDevice);
|
||||||
}
|
}
|
||||||
|
@ -556,6 +560,6 @@ class ArgParser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export function parseArgs(argv) {
|
export function parseArgs(argv?: string[]) {
|
||||||
return new ArgParser().parseArgs(argv);
|
return new ArgParser().parseArgs(argv);
|
||||||
}
|
}
|
|
@ -1,6 +1,8 @@
|
||||||
import fs from "fs";
|
import fs from "fs";
|
||||||
|
|
||||||
import { logger, errJSON } from "./logger.js";
|
import { logger, errJSON } from "./logger.js";
|
||||||
|
import { HTTPRequest, Page } from "puppeteer-core";
|
||||||
|
import { Browser } from "./browser.js";
|
||||||
|
|
||||||
const RULE_TYPES = ["block", "allowOnly"];
|
const RULE_TYPES = ["block", "allowOnly"];
|
||||||
|
|
||||||
|
@ -14,11 +16,23 @@ const BlockState = {
|
||||||
BLOCK_AD: "advertisement"
|
BLOCK_AD: "advertisement"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
type BlockRuleDecl = {
|
||||||
|
url?: string;
|
||||||
|
frameTextMatch?: string;
|
||||||
|
inFrameUrl?: string;
|
||||||
|
type?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
class BlockRule
|
class BlockRule
|
||||||
{
|
{
|
||||||
constructor(data) {
|
type: string;
|
||||||
|
url: RegExp | null;
|
||||||
|
frameTextMatch?: RegExp | null;
|
||||||
|
inFrameUrl?: RegExp | null;
|
||||||
|
|
||||||
|
constructor(data: string | BlockRuleDecl) {
|
||||||
if (typeof(data) === "string") {
|
if (typeof(data) === "string") {
|
||||||
this.url = new RegExp(data);
|
this.url = new RegExp(data);
|
||||||
this.type = "block";
|
this.type = "block";
|
||||||
|
@ -49,7 +63,12 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
export class BlockRules
|
export class BlockRules
|
||||||
{
|
{
|
||||||
constructor(blockRules, blockPutUrl, blockErrMsg) {
|
rules: BlockRule[];
|
||||||
|
blockPutUrl: string;
|
||||||
|
blockErrMsg: string;
|
||||||
|
blockedUrlSet = new Set();
|
||||||
|
|
||||||
|
constructor(blockRules: BlockRuleDecl[], blockPutUrl: string, blockErrMsg: string) {
|
||||||
this.rules = [];
|
this.rules = [];
|
||||||
this.blockPutUrl = blockPutUrl;
|
this.blockPutUrl = blockPutUrl;
|
||||||
this.blockErrMsg = blockErrMsg;
|
this.blockErrMsg = blockErrMsg;
|
||||||
|
@ -68,8 +87,8 @@ export class BlockRules
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async initPage(browser, page) {
|
async initPage(browser: Browser, page: Page) {
|
||||||
const onRequest = async (request) => {
|
const onRequest = async (request: HTTPRequest) => {
|
||||||
const logDetails = {page: page.url()};
|
const logDetails = {page: page.url()};
|
||||||
try {
|
try {
|
||||||
await this.handleRequest(request, logDetails);
|
await this.handleRequest(request, logDetails);
|
||||||
|
@ -80,7 +99,8 @@ export class BlockRules
|
||||||
await browser.interceptRequest(page, onRequest);
|
await browser.interceptRequest(page, onRequest);
|
||||||
}
|
}
|
||||||
|
|
||||||
async handleRequest(request, logDetails) {
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
async handleRequest(request: HTTPRequest, logDetails: Record<string, any>) {
|
||||||
const url = request.url();
|
const url = request.url();
|
||||||
|
|
||||||
let blockState;
|
let blockState;
|
||||||
|
@ -99,7 +119,8 @@ export class BlockRules
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async shouldBlock(request, url, logDetails) {
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
async shouldBlock(request: HTTPRequest, url: string, logDetails: Record<string, any>) {
|
||||||
if (!url.startsWith("http:") && !url.startsWith("https:")) {
|
if (!url.startsWith("http:") && !url.startsWith("https:")) {
|
||||||
return BlockState.ALLOW;
|
return BlockState.ALLOW;
|
||||||
}
|
}
|
||||||
|
@ -107,6 +128,9 @@ export class BlockRules
|
||||||
const isNavReq = request.isNavigationRequest();
|
const isNavReq = request.isNavigationRequest();
|
||||||
|
|
||||||
const frame = request.frame();
|
const frame = request.frame();
|
||||||
|
if (!frame) {
|
||||||
|
return BlockState.ALLOW;
|
||||||
|
}
|
||||||
|
|
||||||
let frameUrl = "";
|
let frameUrl = "";
|
||||||
let blockState;
|
let blockState;
|
||||||
|
@ -157,7 +181,8 @@ export class BlockRules
|
||||||
return BlockState.ALLOW;
|
return BlockState.ALLOW;
|
||||||
}
|
}
|
||||||
|
|
||||||
async ruleCheck(rule, request, reqUrl, frameUrl, isNavReq, logDetails) {
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
async ruleCheck(rule: BlockRule, request: HTTPRequest, reqUrl: string, frameUrl: string, isNavReq: boolean, logDetails: Record<string, any>) {
|
||||||
const {url, inFrameUrl, frameTextMatch} = rule;
|
const {url, inFrameUrl, frameTextMatch} = rule;
|
||||||
|
|
||||||
const type = rule.type || "block";
|
const type = rule.type || "block";
|
||||||
|
@ -187,7 +212,8 @@ export class BlockRules
|
||||||
return {block, done: false};
|
return {block, done: false};
|
||||||
}
|
}
|
||||||
|
|
||||||
async isTextMatch(request, reqUrl, frameTextMatch, logDetails) {
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
async isTextMatch(request: HTTPRequest, reqUrl: string, frameTextMatch: RegExp, logDetails: Record<string, any>) {
|
||||||
try {
|
try {
|
||||||
const res = await fetch(reqUrl);
|
const res = await fetch(reqUrl);
|
||||||
const text = await res.text();
|
const text = await res.text();
|
||||||
|
@ -199,7 +225,7 @@ export class BlockRules
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async recordBlockMsg(url) {
|
async recordBlockMsg(url: string) {
|
||||||
if (this.blockedUrlSet.has(url)) {
|
if (this.blockedUrlSet.has(url)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -221,18 +247,21 @@ export class BlockRules
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
export class AdBlockRules extends BlockRules
|
export class AdBlockRules extends BlockRules
|
||||||
{
|
{
|
||||||
constructor(blockPutUrl, blockErrMsg, adhostsFilePath = "../ad-hosts.json") {
|
adhosts: string[];
|
||||||
|
|
||||||
|
constructor(blockPutUrl: string, blockErrMsg: string, adhostsFilePath = "../../ad-hosts.json") {
|
||||||
super([], blockPutUrl, blockErrMsg);
|
super([], blockPutUrl, blockErrMsg);
|
||||||
this.adhosts = JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url)));
|
this.adhosts = JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url), {"encoding": "utf-8"}));
|
||||||
}
|
}
|
||||||
|
|
||||||
isAdUrl(url) {
|
isAdUrl(url: string) {
|
||||||
const fragments = url.split("/");
|
const fragments = url.split("/");
|
||||||
const domain = fragments.length > 2 ? fragments[2] : null;
|
const domain = fragments.length > 2 ? fragments[2] : null;
|
||||||
return this.adhosts.includes(domain);
|
return domain && this.adhosts.includes(domain);
|
||||||
}
|
}
|
||||||
|
|
||||||
async shouldBlock(request, url, logDetails) {
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
async shouldBlock(request: HTTPRequest, url: string, logDetails: Record<string, any>) {
|
||||||
if (this.isAdUrl(url)) {
|
if (this.isAdUrl(url)) {
|
||||||
logger.debug("URL blocked for being an ad", {url, ...logDetails}, "blocking");
|
logger.debug("URL blocked for being an ad", {url, ...logDetails}, "blocking");
|
||||||
await this.recordBlockMsg(url);
|
await this.recordBlockMsg(url);
|
|
@ -9,61 +9,85 @@ import path from "path";
|
||||||
import { logger } from "./logger.js";
|
import { logger } from "./logger.js";
|
||||||
import { initStorage } from "./storage.js";
|
import { initStorage } from "./storage.js";
|
||||||
|
|
||||||
import puppeteer from "puppeteer-core";
|
import puppeteer, { Frame, HTTPRequest, Page, PuppeteerLaunchOptions, Viewport } from "puppeteer-core";
|
||||||
|
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
|
||||||
|
|
||||||
|
type LaunchOpts = {
|
||||||
|
profileUrl: string;
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
chromeOptions: Record<string, any>
|
||||||
|
signals: boolean;
|
||||||
|
headless: boolean;
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
emulateDevice?: Record<string, any>
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
ondisconnect?: ((err: any) => NonNullable<unknown>) | null
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
// ==================================================================
|
// ==================================================================
|
||||||
export class BaseBrowser
|
export class Browser
|
||||||
{
|
{
|
||||||
|
profileDir: string;
|
||||||
|
customProfile = false;
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
emulateDevice: Record<string, any> | null = null;
|
||||||
|
|
||||||
|
browser?: PptrBrowser | null = null;
|
||||||
|
firstCDP: CDPSession | null = null;
|
||||||
|
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
recorders: any[] = [];
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
|
this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
|
||||||
this.customProfile = false;
|
|
||||||
this.emulateDevice = null;
|
|
||||||
|
|
||||||
this.recorders = [];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async launch({profileUrl, chromeOptions, signals = false, headless = false, emulateDevice = {}, ondisconnect = null} = {}) {
|
async launch({profileUrl, chromeOptions, signals = false, headless = false, emulateDevice = {}, ondisconnect = null} : LaunchOpts) { if (this.isLaunched()) {
|
||||||
if (this.isLaunched()) {
|
return;
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (profileUrl) {
|
|
||||||
this.customProfile = await this.loadProfile(profileUrl);
|
|
||||||
}
|
|
||||||
|
|
||||||
this.emulateDevice = emulateDevice;
|
|
||||||
|
|
||||||
const args = this.chromeArgs(chromeOptions);
|
|
||||||
|
|
||||||
let defaultViewport = null;
|
|
||||||
|
|
||||||
if (process.env.GEOMETRY) {
|
|
||||||
const geom = process.env.GEOMETRY.split("x");
|
|
||||||
|
|
||||||
defaultViewport = {width: Number(geom[0]), height: Number(geom[1])};
|
|
||||||
}
|
|
||||||
|
|
||||||
const launchOpts = {
|
|
||||||
args,
|
|
||||||
headless: headless ? "new" : false,
|
|
||||||
executablePath: this.getBrowserExe(),
|
|
||||||
ignoreDefaultArgs: ["--enable-automation", "--hide-scrollbars"],
|
|
||||||
ignoreHTTPSErrors: true,
|
|
||||||
handleSIGHUP: signals,
|
|
||||||
handleSIGINT: signals,
|
|
||||||
handleSIGTERM: signals,
|
|
||||||
protocolTimeout: 0,
|
|
||||||
|
|
||||||
defaultViewport,
|
|
||||||
waitForInitialPage: false,
|
|
||||||
userDataDir: this.profileDir
|
|
||||||
};
|
|
||||||
|
|
||||||
await this._init(launchOpts, ondisconnect);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async setupPage({page}) {
|
if (profileUrl) {
|
||||||
|
this.customProfile = await this.loadProfile(profileUrl);
|
||||||
|
}
|
||||||
|
|
||||||
|
this.emulateDevice = emulateDevice;
|
||||||
|
|
||||||
|
const args = this.chromeArgs(chromeOptions);
|
||||||
|
|
||||||
|
let defaultViewport = null;
|
||||||
|
|
||||||
|
if (process.env.GEOMETRY) {
|
||||||
|
const geom = process.env.GEOMETRY.split("x");
|
||||||
|
|
||||||
|
defaultViewport = {width: Number(geom[0]), height: Number(geom[1])};
|
||||||
|
}
|
||||||
|
|
||||||
|
const launchOpts : PuppeteerLaunchOptions = {
|
||||||
|
args,
|
||||||
|
headless: headless ? "new" : false,
|
||||||
|
executablePath: this.getBrowserExe(),
|
||||||
|
ignoreDefaultArgs: ["--enable-automation", "--hide-scrollbars"],
|
||||||
|
ignoreHTTPSErrors: true,
|
||||||
|
handleSIGHUP: signals,
|
||||||
|
handleSIGINT: signals,
|
||||||
|
handleSIGTERM: signals,
|
||||||
|
protocolTimeout: 0,
|
||||||
|
|
||||||
|
defaultViewport,
|
||||||
|
waitForInitialPage: false,
|
||||||
|
userDataDir: this.profileDir
|
||||||
|
};
|
||||||
|
|
||||||
|
await this._init(launchOpts, ondisconnect);
|
||||||
|
}
|
||||||
|
|
||||||
|
async setupPage({page} : {page: Page, cdp: CDPSession}) {
|
||||||
await this.addInitScript(page, "Object.defineProperty(navigator, \"webdriver\", {value: false});");
|
await this.addInitScript(page, "Object.defineProperty(navigator, \"webdriver\", {value: false});");
|
||||||
|
|
||||||
if (this.customProfile) {
|
if (this.customProfile) {
|
||||||
|
@ -73,7 +97,7 @@ export class BaseBrowser
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async loadProfile(profileFilename) {
|
async loadProfile(profileFilename: string) : Promise<boolean> {
|
||||||
const targetFilename = "/tmp/profile.tar.gz";
|
const targetFilename = "/tmp/profile.tar.gz";
|
||||||
|
|
||||||
if (profileFilename &&
|
if (profileFilename &&
|
||||||
|
@ -83,16 +107,19 @@ export class BaseBrowser
|
||||||
|
|
||||||
const resp = await fetch(profileFilename);
|
const resp = await fetch(profileFilename);
|
||||||
await pipeline(
|
await pipeline(
|
||||||
Readable.fromWeb(resp.body),
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
Readable.fromWeb(resp.body as any),
|
||||||
fs.createWriteStream(targetFilename)
|
fs.createWriteStream(targetFilename)
|
||||||
);
|
);
|
||||||
|
|
||||||
profileFilename = targetFilename;
|
profileFilename = targetFilename;
|
||||||
} else if (profileFilename && profileFilename.startsWith("@")) {
|
} else if (profileFilename && profileFilename.startsWith("@")) {
|
||||||
const storage = initStorage("");
|
const storage = initStorage();
|
||||||
|
|
||||||
if (!storage) {
|
if (!storage) {
|
||||||
logger.fatal("Profile specified relative to s3 storage, but no S3 storage defined");
|
logger.fatal("Profile specified relative to s3 storage, but no S3 storage defined");
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
await storage.downloadFile(profileFilename.slice(1), targetFilename);
|
await storage.downloadFile(profileFilename.slice(1), targetFilename);
|
||||||
|
@ -112,7 +139,7 @@ export class BaseBrowser
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
saveProfile(profileFilename) {
|
saveProfile(profileFilename: string) {
|
||||||
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: this.profileDir});
|
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: this.profileDir});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -142,11 +169,17 @@ export class BaseBrowser
|
||||||
}
|
}
|
||||||
|
|
||||||
getDefaultUA() {
|
getDefaultUA() {
|
||||||
let version = process.env.BROWSER_VERSION;
|
let version : string | undefined = process.env.BROWSER_VERSION;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
version = child_process.execFileSync(this.getBrowserExe(), ["--version"], {encoding: "utf8"});
|
const browser = this.getBrowserExe();
|
||||||
version = version.match(/[\d.]+/)[0];
|
if (browser) {
|
||||||
|
version = child_process.execFileSync(browser, ["--version"], {encoding: "utf8"});
|
||||||
|
const match = version && version.match(/[\d.]+/);
|
||||||
|
if (match) {
|
||||||
|
version = match[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
} catch(e) {
|
} catch(e) {
|
||||||
console.error(e);
|
console.error(e);
|
||||||
}
|
}
|
||||||
|
@ -161,13 +194,13 @@ export class BaseBrowser
|
||||||
return file;
|
return file;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async evaluateWithCLI_(cdp, frame, cdpContextId, funcString, logData, contextName) {
|
async evaluateWithCLI_(cdp: CDPSession, frame: Frame, cdpContextId: number, funcString: string, logData: Record<string, string>, contextName: string) {
|
||||||
const frameUrl = frame.url();
|
const frameUrl = frame.url();
|
||||||
let details = {frameUrl, ...logData};
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
let details : Record<string, any> = {frameUrl, ...logData};
|
||||||
|
|
||||||
if (!frameUrl || frame.isDetached()) {
|
if (!frameUrl || frame.isDetached()) {
|
||||||
logger.info("Run Script Skipped, frame no longer attached or has no URL", details, contextName);
|
logger.info("Run Script Skipped, frame no longer attached or has no URL", details, contextName);
|
||||||
|
@ -201,18 +234,6 @@ export class BaseBrowser
|
||||||
|
|
||||||
return result.value;
|
return result.value;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// ==================================================================
|
|
||||||
export class Browser extends BaseBrowser
|
|
||||||
{
|
|
||||||
constructor() {
|
|
||||||
super();
|
|
||||||
this.browser = null;
|
|
||||||
|
|
||||||
this.firstCDP = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
isLaunched() {
|
isLaunched() {
|
||||||
if (this.browser) {
|
if (this.browser) {
|
||||||
|
@ -231,11 +252,12 @@ export class Browser extends BaseBrowser
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
addInitScript(page, script) {
|
addInitScript(page: Page, script: string) {
|
||||||
return page.evaluateOnNewDocument(script);
|
return page.evaluateOnNewDocument(script);
|
||||||
}
|
}
|
||||||
|
|
||||||
async _init(launchOpts, ondisconnect = null) {
|
// eslint-disable-next-line @typescript-eslint/ban-types
|
||||||
|
async _init(launchOpts: PuppeteerLaunchOptions, ondisconnect : Function | null = null) {
|
||||||
this.browser = await puppeteer.launch(launchOpts);
|
this.browser = await puppeteer.launch(launchOpts);
|
||||||
|
|
||||||
const target = this.browser.target();
|
const target = this.browser.target();
|
||||||
|
@ -252,21 +274,29 @@ export class Browser extends BaseBrowser
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
async newWindowPageWithCDP() {
|
async newWindowPageWithCDP() : Promise<{cdp: CDPSession, page: Page}> {
|
||||||
// unique url to detect new pages
|
// unique url to detect new pages
|
||||||
const startPage = "about:blank?_browsertrix" + Math.random().toString(36).slice(2);
|
const startPage = "about:blank?_browsertrix" + Math.random().toString(36).slice(2);
|
||||||
|
|
||||||
const p = new Promise((resolve) => {
|
const p = new Promise<Target>((resolve) => {
|
||||||
const listener = (target) => {
|
const listener = (target: Target) => {
|
||||||
if (target.url() === startPage) {
|
if (target.url() === startPage) {
|
||||||
resolve(target);
|
resolve(target);
|
||||||
this.browser.removeListener("targetcreated", listener);
|
if (this.browser) {
|
||||||
|
this.browser.removeListener("targetcreated", listener);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
this.browser.on("targetcreated", listener);
|
if (this.browser) {
|
||||||
|
this.browser.on("targetcreated", listener);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
if (!this.firstCDP) {
|
||||||
|
throw new Error("CDP missing");
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await this.firstCDP.send("Target.createTarget", {url: startPage, newWindow: true});
|
await this.firstCDP.send("Target.createTarget", {url: startPage, newWindow: true});
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
|
@ -283,12 +313,17 @@ export class Browser extends BaseBrowser
|
||||||
const target = await p;
|
const target = await p;
|
||||||
|
|
||||||
const page = await target.page();
|
const page = await target.page();
|
||||||
|
if (!page) {
|
||||||
|
throw new Error("page missing");
|
||||||
|
}
|
||||||
|
|
||||||
const device = this.emulateDevice;
|
const device = this.emulateDevice;
|
||||||
|
|
||||||
if (device) {
|
if (device && page) {
|
||||||
if (device.viewport && device.userAgent) {
|
if (device.viewport && device.userAgent) {
|
||||||
await page.emulate(device);
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
await page.emulate(device as any);
|
||||||
} else if (device.userAgent) {
|
} else if (device.userAgent) {
|
||||||
await page.setUserAgent(device.userAgent);
|
await page.setUserAgent(device.userAgent);
|
||||||
}
|
}
|
||||||
|
@ -300,9 +335,17 @@ export class Browser extends BaseBrowser
|
||||||
}
|
}
|
||||||
|
|
||||||
async serviceWorkerFetch() {
|
async serviceWorkerFetch() {
|
||||||
|
if (!this.firstCDP) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
this.firstCDP.on("Fetch.requestPaused", async (params) => {
|
this.firstCDP.on("Fetch.requestPaused", async (params) => {
|
||||||
const { frameId, requestId, networkId, request } = params;
|
const { frameId, requestId, networkId, request } = params;
|
||||||
|
|
||||||
|
if (!this.firstCDP) {
|
||||||
|
throw new Error("CDP missing");
|
||||||
|
}
|
||||||
|
|
||||||
if (networkId) {
|
if (networkId) {
|
||||||
try {
|
try {
|
||||||
await this.firstCDP.send("Fetch.continueResponse", {requestId});
|
await this.firstCDP.send("Fetch.continueResponse", {requestId});
|
||||||
|
@ -343,30 +386,44 @@ export class Browser extends BaseBrowser
|
||||||
await this.firstCDP.send("Fetch.enable", {patterns: [{urlPattern: "*", requestStage: "Response"}]});
|
await this.firstCDP.send("Fetch.enable", {patterns: [{urlPattern: "*", requestStage: "Response"}]});
|
||||||
}
|
}
|
||||||
|
|
||||||
async evaluateWithCLI(_, frame, cdp, funcString, logData, contextName) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
const context = await frame.executionContext();
|
|
||||||
|
async evaluateWithCLI(
|
||||||
|
_: unknown,
|
||||||
|
frame: Frame,
|
||||||
|
cdp: CDPSession,
|
||||||
|
funcString: string,
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
logData: Record<string, any>,
|
||||||
|
contextName: string
|
||||||
|
) {
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
const context = await (frame as any).executionContext();
|
||||||
cdp = context._client;
|
cdp = context._client;
|
||||||
const cdpContextId = context._contextId;
|
const cdpContextId = context._contextId;
|
||||||
return await this.evaluateWithCLI_(cdp, frame, cdpContextId, funcString, logData, contextName);
|
return await this.evaluateWithCLI_(cdp, frame, cdpContextId, funcString, logData, contextName);
|
||||||
}
|
}
|
||||||
|
|
||||||
interceptRequest(page, callback) {
|
interceptRequest(page: Page, callback: (event: HTTPRequest) => void) {
|
||||||
page.on("request", callback);
|
page.on("request", callback);
|
||||||
}
|
}
|
||||||
|
|
||||||
async waitForNetworkIdle(page, params) {
|
async waitForNetworkIdle(page: Page, params: {timeout?: number}) {
|
||||||
return await page.waitForNetworkIdle(params);
|
return await page.waitForNetworkIdle(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
async setViewport(page, params) {
|
async setViewport(page: Page, params: Viewport) {
|
||||||
await page.setViewport(params);
|
await page.setViewport(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
async getCookies(page) {
|
async getCookies(page: Page) {
|
||||||
return await page.cookies();
|
return await page.cookies();
|
||||||
}
|
}
|
||||||
|
|
||||||
async setCookies(page, cookies) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
async setCookies(page: Page, cookies: any) {
|
||||||
return await page.setCookie(...cookies);
|
return await page.setCookie(...cookies);
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -3,7 +3,7 @@ import path from "path";
|
||||||
|
|
||||||
const MAX_DEPTH = 2;
|
const MAX_DEPTH = 2;
|
||||||
|
|
||||||
export function collectAllFileSources(fileOrDir, ext = null, depth = 0) {
|
export function collectAllFileSources(fileOrDir: string, ext?: string, depth = 0) : string[] {
|
||||||
const resolvedPath = path.resolve(fileOrDir);
|
const resolvedPath = path.resolve(fileOrDir);
|
||||||
|
|
||||||
if (depth >= MAX_DEPTH) {
|
if (depth >= MAX_DEPTH) {
|
||||||
|
@ -13,14 +13,14 @@ export function collectAllFileSources(fileOrDir, ext = null, depth = 0) {
|
||||||
|
|
||||||
const stat = fs.statSync(resolvedPath);
|
const stat = fs.statSync(resolvedPath);
|
||||||
|
|
||||||
if (stat.isFile && (ext === null || path.extname(resolvedPath) === ext)) {
|
if (stat.isFile() && (ext === null || path.extname(resolvedPath) === ext)) {
|
||||||
const contents = fs.readFileSync(resolvedPath);
|
const contents = fs.readFileSync(resolvedPath);
|
||||||
return [`/* src: ${resolvedPath} */\n\n${contents}`];
|
return [`/* src: ${resolvedPath} */\n\n${contents}`];
|
||||||
}
|
}
|
||||||
|
|
||||||
if (stat.isDirectory) {
|
if (stat.isDirectory()) {
|
||||||
const files = fs.readdirSync(resolvedPath);
|
const files = fs.readdirSync(resolvedPath);
|
||||||
return files.reduce((acc, next) => {
|
return files.reduce((acc: string[], next: string) => {
|
||||||
const nextPath = path.join(fileOrDir, next);
|
const nextPath = path.join(fileOrDir, next);
|
||||||
return [...acc, ...collectAllFileSources(nextPath, ext, depth + 1)];
|
return [...acc, ...collectAllFileSources(nextPath, ext, depth + 1)];
|
||||||
}, []);
|
}, []);
|
||||||
|
@ -28,6 +28,7 @@ export function collectAllFileSources(fileOrDir, ext = null, depth = 0) {
|
||||||
|
|
||||||
if (depth === 0) {
|
if (depth === 0) {
|
||||||
console.warn(`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`);
|
console.warn(`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`);
|
||||||
return [];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return [];
|
||||||
}
|
}
|
|
@ -6,9 +6,14 @@ import { logger } from "./logger.js";
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
export class HealthChecker
|
export class HealthChecker
|
||||||
{
|
{
|
||||||
constructor(port, errorThreshold) {
|
port: number;
|
||||||
|
errorThreshold: number;
|
||||||
|
healthServer: http.Server;
|
||||||
|
|
||||||
|
errorCount = 0;
|
||||||
|
|
||||||
|
constructor(port: number, errorThreshold: number) {
|
||||||
this.port = port;
|
this.port = port;
|
||||||
this.errorCount = 0;
|
|
||||||
this.errorThreshold = errorThreshold;
|
this.errorThreshold = errorThreshold;
|
||||||
|
|
||||||
this.healthServer = http.createServer((...args) => this.healthCheck(...args));
|
this.healthServer = http.createServer((...args) => this.healthCheck(...args));
|
||||||
|
@ -16,8 +21,8 @@ export class HealthChecker
|
||||||
this.healthServer.listen(port);
|
this.healthServer.listen(port);
|
||||||
}
|
}
|
||||||
|
|
||||||
async healthCheck(req, res) {
|
async healthCheck(req: http.IncomingMessage, res: http.ServerResponse) {
|
||||||
const pathname = url.parse(req.url).pathname;
|
const pathname = req.url ? url.parse(req.url).pathname : "";
|
||||||
switch (pathname) {
|
switch (pathname) {
|
||||||
case "/healthz":
|
case "/healthz":
|
||||||
if (this.errorCount < this.errorThreshold) {
|
if (this.errorCount < this.errorThreshold) {
|
|
@ -1,57 +1,73 @@
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
// to fix serialization of regexes for logging purposes
|
// to fix serialization of regexes for logging purposes
|
||||||
RegExp.prototype.toJSON = RegExp.prototype.toString;
|
|
||||||
|
import { Writable } from "node:stream";
|
||||||
|
import { RedisCrawlState } from "./state.js";
|
||||||
|
|
||||||
|
// RegExp.prototype.toJSON = RegExp.prototype.toString;
|
||||||
|
Object.defineProperty(RegExp.prototype, "toJSON", { value: RegExp.prototype.toString });
|
||||||
|
|
||||||
|
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
export function errJSON(e) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
return {"type": "exception", "message": e.message, "stack": e.stack};
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
export function errJSON(e: any) {
|
||||||
|
if (e instanceof Error) {
|
||||||
|
return {"type": "exception", "message": e.message, "stack": e.stack};
|
||||||
|
} else {
|
||||||
|
return {"message": e.toString()};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
class Logger
|
class Logger
|
||||||
{
|
{
|
||||||
constructor() {
|
logStream : Writable | null = null;
|
||||||
this.logStream = null;
|
debugLogging = false;
|
||||||
this.debugLogging = null;
|
logErrorsToRedis = false;
|
||||||
this.logErrorsToRedis = false;
|
logLevels : string[] = [];
|
||||||
this.logLevels = [];
|
contexts : string[] = [];
|
||||||
this.contexts = [];
|
crawlState? : RedisCrawlState | null = null;
|
||||||
this.crawlState = null;
|
fatalExitCode = 17;
|
||||||
|
|
||||||
this.fatalExitCode = 17;
|
setDefaultFatalExitCode(exitCode: number) {
|
||||||
}
|
|
||||||
|
|
||||||
setDefaultFatalExitCode(exitCode) {
|
|
||||||
this.fatalExitCode = exitCode;
|
this.fatalExitCode = exitCode;
|
||||||
}
|
}
|
||||||
|
|
||||||
setExternalLogStream(logFH) {
|
setExternalLogStream(logFH: Writable | null) {
|
||||||
this.logStream = logFH;
|
this.logStream = logFH;
|
||||||
}
|
}
|
||||||
|
|
||||||
setDebugLogging(debugLog) {
|
setDebugLogging(debugLog: boolean) {
|
||||||
this.debugLogging = debugLog;
|
this.debugLogging = debugLog;
|
||||||
}
|
}
|
||||||
|
|
||||||
setLogErrorsToRedis(logErrorsToRedis) {
|
setLogErrorsToRedis(logErrorsToRedis: boolean) {
|
||||||
this.logErrorsToRedis = logErrorsToRedis;
|
this.logErrorsToRedis = logErrorsToRedis;
|
||||||
}
|
}
|
||||||
|
|
||||||
setLogLevel(logLevels) {
|
setLogLevel(logLevels: string[]) {
|
||||||
this.logLevels = logLevels;
|
this.logLevels = logLevels;
|
||||||
}
|
}
|
||||||
|
|
||||||
setContext(contexts) {
|
setContext(contexts: string[]) {
|
||||||
this.contexts = contexts;
|
this.contexts = contexts;
|
||||||
}
|
}
|
||||||
|
|
||||||
setCrawlState(crawlState) {
|
setCrawlState(crawlState: RedisCrawlState) {
|
||||||
this.crawlState = crawlState;
|
this.crawlState = crawlState;
|
||||||
}
|
}
|
||||||
|
|
||||||
logAsJSON(message, data, context, logLevel="info") {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
|
||||||
|
logAsJSON(
|
||||||
|
message: string,
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
data: Record<string, string> | Error | any,
|
||||||
|
context: string,
|
||||||
|
logLevel="info"
|
||||||
|
) {
|
||||||
if (data instanceof Error) {
|
if (data instanceof Error) {
|
||||||
data = errJSON(data);
|
data = errJSON(data);
|
||||||
} else if (typeof data !== "object") {
|
} else if (typeof data !== "object") {
|
||||||
|
@ -70,7 +86,7 @@ class Logger
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let dataToLog = {
|
const dataToLog = {
|
||||||
"timestamp": new Date().toISOString(),
|
"timestamp": new Date().toISOString(),
|
||||||
"logLevel": logLevel,
|
"logLevel": logLevel,
|
||||||
"context": context,
|
"context": context,
|
||||||
|
@ -84,30 +100,30 @@ class Logger
|
||||||
}
|
}
|
||||||
|
|
||||||
const toLogToRedis = ["error", "fatal"];
|
const toLogToRedis = ["error", "fatal"];
|
||||||
if (this.logErrorsToRedis && toLogToRedis.includes(logLevel)) {
|
if (this.logErrorsToRedis && this.crawlState && toLogToRedis.includes(logLevel)) {
|
||||||
this.crawlState.logError(string);
|
this.crawlState.logError(string);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
info(message, data={}, context="general") {
|
info(message: string, data={}, context="general") {
|
||||||
this.logAsJSON(message, data, context);
|
this.logAsJSON(message, data, context);
|
||||||
}
|
}
|
||||||
|
|
||||||
error(message, data={}, context="general") {
|
error(message: string, data={}, context="general") {
|
||||||
this.logAsJSON(message, data, context, "error");
|
this.logAsJSON(message, data, context, "error");
|
||||||
}
|
}
|
||||||
|
|
||||||
warn(message, data={}, context="general") {
|
warn(message: string, data={}, context="general") {
|
||||||
this.logAsJSON(message, data, context, "warn");
|
this.logAsJSON(message, data, context, "warn");
|
||||||
}
|
}
|
||||||
|
|
||||||
debug(message, data={}, context="general") {
|
debug(message: string, data={}, context="general") {
|
||||||
if (this.debugLogging) {
|
if (this.debugLogging) {
|
||||||
this.logAsJSON(message, data, context, "debug");
|
this.logAsJSON(message, data, context, "debug");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fatal(message, data={}, context="general", exitCode=0) {
|
fatal(message: string, data={}, context="general", exitCode=0) {
|
||||||
exitCode = exitCode || this.fatalExitCode;
|
exitCode = exitCode || this.fatalExitCode;
|
||||||
this.logAsJSON(`${message}. Quitting`, data, context, "fatal");
|
this.logAsJSON(`${message}. Quitting`, data, context, "fatal");
|
||||||
|
|
|
@ -1,10 +1,14 @@
|
||||||
|
import { HTTPRequest, Page } from "puppeteer-core";
|
||||||
import { errJSON, logger } from "./logger.js";
|
import { errJSON, logger } from "./logger.js";
|
||||||
|
import { Browser } from "./browser.js";
|
||||||
|
|
||||||
export class OriginOverride
|
export class OriginOverride
|
||||||
{
|
{
|
||||||
constructor(originOverride) {
|
originOverride: {origUrl: URL, destUrl: URL}[];
|
||||||
|
|
||||||
|
constructor(originOverride: string[]) {
|
||||||
this.originOverride = originOverride.map((override) => {
|
this.originOverride = originOverride.map((override) => {
|
||||||
let [orig, dest] = override.split("=");
|
const [orig, dest] = override.split("=");
|
||||||
const origUrl = new URL(orig);
|
const origUrl = new URL(orig);
|
||||||
const destUrl = new URL(dest);
|
const destUrl = new URL(dest);
|
||||||
|
|
||||||
|
@ -12,8 +16,8 @@ export class OriginOverride
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
async initPage(browser, page) {
|
async initPage(browser: Browser, page: Page) {
|
||||||
const onRequest = async (request) => {
|
const onRequest = async (request: HTTPRequest) => {
|
||||||
try {
|
try {
|
||||||
const url = request.url();
|
const url = request.url();
|
||||||
|
|
||||||
|
@ -28,12 +32,13 @@ export class OriginOverride
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!newUrl) {
|
if (!newUrl || !orig) {
|
||||||
request.continue({}, -1);
|
request.continue({}, -1);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const headers = new Headers(request.headers());
|
const headers = new Headers(request.headers());
|
||||||
|
|
||||||
headers.set("host", orig.host);
|
headers.set("host", orig.host);
|
||||||
if (headers.get("origin")) {
|
if (headers.get("origin")) {
|
||||||
headers.set("origin", orig.origin);
|
headers.set("origin", orig.origin);
|
|
@ -10,12 +10,16 @@ import { logger, errJSON } from "./logger.js";
|
||||||
import { sleep, timestampNow } from "./timing.js";
|
import { sleep, timestampNow } from "./timing.js";
|
||||||
import { RequestResponseInfo } from "./reqresp.js";
|
import { RequestResponseInfo } from "./reqresp.js";
|
||||||
|
|
||||||
|
// @ts-expect-error TODO fill in why error is expected
|
||||||
import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js";
|
import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js";
|
||||||
|
// @ts-expect-error TODO fill in why error is expected
|
||||||
import { rewriteDASH, rewriteHLS } from "@webrecorder/wabac/src/rewrite/rewriteVideo.js";
|
import { rewriteDASH, rewriteHLS } from "@webrecorder/wabac/src/rewrite/rewriteVideo.js";
|
||||||
|
|
||||||
import { WARCRecord } from "warcio";
|
import { WARCRecord } from "warcio";
|
||||||
import { WARCSerializer } from "warcio/node";
|
import { TempFileBuffer, WARCSerializer } from "warcio/node";
|
||||||
import { WARCWriter } from "./warcwriter.js";
|
import { WARCWriter } from "./warcwriter.js";
|
||||||
|
import { RedisCrawlState, WorkerId } from "./state.js";
|
||||||
|
import { CDPSession, Protocol } from "puppeteer-core";
|
||||||
|
|
||||||
const MAX_BROWSER_FETCH_SIZE = 2_000_000;
|
const MAX_BROWSER_FETCH_SIZE = 2_000_000;
|
||||||
const MAX_NETWORK_LOAD_SIZE = 200_000_000;
|
const MAX_NETWORK_LOAD_SIZE = 200_000_000;
|
||||||
|
@ -26,15 +30,58 @@ const WRITE_DUPE_KEY = "s:writedupe";
|
||||||
|
|
||||||
const encoder = new TextEncoder();
|
const encoder = new TextEncoder();
|
||||||
|
|
||||||
|
|
||||||
// =================================================================
|
// =================================================================
|
||||||
function logNetwork(/*msg, data*/) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unused-vars
|
||||||
|
function logNetwork(msg: string, data: any) {
|
||||||
// logger.debug(msg, data, "recorderNetwork");
|
// logger.debug(msg, data, "recorderNetwork");
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================
|
// =================================================================
|
||||||
export class Recorder
|
export class Recorder
|
||||||
{
|
{
|
||||||
constructor({workerid, collDir, crawler}) {
|
workerid: WorkerId;
|
||||||
|
collDir: string;
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
crawler: any;
|
||||||
|
|
||||||
|
crawlState: RedisCrawlState;
|
||||||
|
|
||||||
|
warcQ: PQueue;
|
||||||
|
fetcherQ: PQueue;
|
||||||
|
|
||||||
|
pendingRequests!: Map<string, RequestResponseInfo>;
|
||||||
|
skipIds!: Set<string>;
|
||||||
|
|
||||||
|
swSessionId?: string | null;
|
||||||
|
swFrameIds = new Set<string>();
|
||||||
|
swUrls = new Set<string>();
|
||||||
|
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
logDetails: Record<string, any> = {};
|
||||||
|
skipping = false;
|
||||||
|
|
||||||
|
allowFull206 = false;
|
||||||
|
|
||||||
|
archivesDir: string;
|
||||||
|
tempdir: string;
|
||||||
|
tempCdxDir: string;
|
||||||
|
|
||||||
|
gzip = true;
|
||||||
|
|
||||||
|
writer: WARCWriter;
|
||||||
|
|
||||||
|
pageid!: string;
|
||||||
|
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
{workerid, collDir, crawler} : {workerid: WorkerId, collDir: string, crawler: any}
|
||||||
|
) {
|
||||||
this.workerid = workerid;
|
this.workerid = workerid;
|
||||||
this.crawler = crawler;
|
this.crawler = crawler;
|
||||||
this.crawlState = crawler.crawlState;
|
this.crawlState = crawler.crawlState;
|
||||||
|
@ -43,19 +90,6 @@ export class Recorder
|
||||||
|
|
||||||
this.fetcherQ = new PQueue({concurrency: 1});
|
this.fetcherQ = new PQueue({concurrency: 1});
|
||||||
|
|
||||||
this.pendingRequests = null;
|
|
||||||
this.skipIds = null;
|
|
||||||
|
|
||||||
this.swSessionId = null;
|
|
||||||
this.swFrameIds = new Set();
|
|
||||||
this.swUrls = new Set();
|
|
||||||
|
|
||||||
|
|
||||||
this.logDetails = {};
|
|
||||||
this.skipping = false;
|
|
||||||
|
|
||||||
this.allowFull206 = true;
|
|
||||||
|
|
||||||
this.collDir = collDir;
|
this.collDir = collDir;
|
||||||
|
|
||||||
this.archivesDir = path.join(this.collDir, "archive");
|
this.archivesDir = path.join(this.collDir, "archive");
|
||||||
|
@ -68,7 +102,6 @@ export class Recorder
|
||||||
|
|
||||||
const crawlId = process.env.CRAWL_ID || os.hostname();
|
const crawlId = process.env.CRAWL_ID || os.hostname();
|
||||||
const filename = `rec-${crawlId}-${timestampNow()}-${this.workerid}.warc`;
|
const filename = `rec-${crawlId}-${timestampNow()}-${this.workerid}.warc`;
|
||||||
this.gzip = true;
|
|
||||||
|
|
||||||
this.writer = new WARCWriter({
|
this.writer = new WARCWriter({
|
||||||
archivesDir: this.archivesDir,
|
archivesDir: this.archivesDir,
|
||||||
|
@ -79,7 +112,7 @@ export class Recorder
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
async onCreatePage({cdp}) {
|
async onCreatePage({cdp} : {cdp: CDPSession}) {
|
||||||
// Fetch
|
// Fetch
|
||||||
|
|
||||||
cdp.on("Fetch.requestPaused", async (params) => {
|
cdp.on("Fetch.requestPaused", async (params) => {
|
||||||
|
@ -159,7 +192,7 @@ export class Recorder
|
||||||
await cdp.send("Target.setAutoAttach", {autoAttach: true, waitForDebuggerOnStart: false, flatten: true});
|
await cdp.send("Target.setAutoAttach", {autoAttach: true, waitForDebuggerOnStart: false, flatten: true});
|
||||||
}
|
}
|
||||||
|
|
||||||
handleResponseReceived(params) {
|
handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) {
|
||||||
const { requestId, response } = params;
|
const { requestId, response } = params;
|
||||||
|
|
||||||
const reqresp = this.pendingReqResp(requestId);
|
const reqresp = this.pendingReqResp(requestId);
|
||||||
|
@ -170,7 +203,7 @@ export class Recorder
|
||||||
reqresp.fillResponse(response);
|
reqresp.fillResponse(response);
|
||||||
}
|
}
|
||||||
|
|
||||||
handleRequestExtraInfo(params) {
|
handleRequestExtraInfo(params: Protocol.Network.RequestWillBeSentExtraInfoEvent) {
|
||||||
if (!this.shouldSkip(params.headers)) {
|
if (!this.shouldSkip(params.headers)) {
|
||||||
const reqresp = this.pendingReqResp(params.requestId, true);
|
const reqresp = this.pendingReqResp(params.requestId, true);
|
||||||
if (reqresp) {
|
if (reqresp) {
|
||||||
|
@ -179,13 +212,13 @@ export class Recorder
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
handleRedirectResponse(params) {
|
handleRedirectResponse(params: Protocol.Network.RequestWillBeSentEvent) {
|
||||||
const { requestId, redirectResponse } = params;
|
const { requestId, redirectResponse } = params;
|
||||||
|
|
||||||
// remove and serialize, but allow reusing requestId
|
// remove and serialize, but allow reusing requestId
|
||||||
// as redirect chain may reuse same requestId for subsequent request
|
// as redirect chain may reuse same requestId for subsequent request
|
||||||
const reqresp = this.removeReqResp(requestId, true);
|
const reqresp = this.removeReqResp(requestId, true);
|
||||||
if (!reqresp) {
|
if (!reqresp || !redirectResponse) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -199,7 +232,7 @@ export class Recorder
|
||||||
this.serializeToWARC(reqresp);
|
this.serializeToWARC(reqresp);
|
||||||
}
|
}
|
||||||
|
|
||||||
handleLoadingFailed(params) {
|
handleLoadingFailed(params: Protocol.Network.LoadingFailedEvent) {
|
||||||
const { errorText, type, requestId } = params;
|
const { errorText, type, requestId } = params;
|
||||||
|
|
||||||
const reqresp = this.pendingReqResp(requestId, true);
|
const reqresp = this.pendingReqResp(requestId, true);
|
||||||
|
@ -211,13 +244,13 @@ export class Recorder
|
||||||
|
|
||||||
switch (errorText) {
|
switch (errorText) {
|
||||||
case "net::ERR_BLOCKED_BY_CLIENT":
|
case "net::ERR_BLOCKED_BY_CLIENT":
|
||||||
logNetwork("Request blocked", {url, errorText, ...this.logDetails}, "recorder");
|
logNetwork("Request blocked", {url, errorText, ...this.logDetails});
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case "net::ERR_ABORTED":
|
case "net::ERR_ABORTED":
|
||||||
// check if this is a false positive -- a valid download that's already been fetched
|
// check if this is a false positive -- a valid download that's already been fetched
|
||||||
// the abort is just for page, but download will succeed
|
// the abort is just for page, but download will succeed
|
||||||
if (url && type === "Document" && reqresp.isValidBinary()) {
|
if (type === "Document" && reqresp.isValidBinary()) {
|
||||||
this.serializeToWARC(reqresp);
|
this.serializeToWARC(reqresp);
|
||||||
//} else if (url) {
|
//} else if (url) {
|
||||||
} else if (url && reqresp.requestHeaders && reqresp.requestHeaders["x-browsertrix-fetch"]) {
|
} else if (url && reqresp.requestHeaders && reqresp.requestHeaders["x-browsertrix-fetch"]) {
|
||||||
|
@ -235,7 +268,7 @@ export class Recorder
|
||||||
this.removeReqResp(requestId);
|
this.removeReqResp(requestId);
|
||||||
}
|
}
|
||||||
|
|
||||||
handleLoadingFinished(params) {
|
handleLoadingFinished(params: Protocol.Network.LoadingFinishedEvent) {
|
||||||
const reqresp = this.pendingReqResp(params.requestId, true);
|
const reqresp = this.pendingReqResp(params.requestId, true);
|
||||||
|
|
||||||
if (!reqresp || reqresp.asyncLoading) {
|
if (!reqresp || reqresp.asyncLoading) {
|
||||||
|
@ -251,7 +284,7 @@ export class Recorder
|
||||||
this.serializeToWARC(reqresp);
|
this.serializeToWARC(reqresp);
|
||||||
}
|
}
|
||||||
|
|
||||||
async handleRequestPaused(params, cdp, isSWorker = false) {
|
async handleRequestPaused(params: Protocol.Fetch.RequestPausedEvent, cdp: CDPSession, isSWorker = false) {
|
||||||
const { requestId, request, responseStatusCode, responseErrorReason, resourceType, networkId } = params;
|
const { requestId, request, responseStatusCode, responseErrorReason, resourceType, networkId } = params;
|
||||||
const { method, headers, url } = request;
|
const { method, headers, url } = request;
|
||||||
|
|
||||||
|
@ -276,7 +309,7 @@ export class Recorder
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async handleFetchResponse(params, cdp, isSWorker) {
|
async handleFetchResponse(params: Protocol.Fetch.RequestPausedEvent, cdp: CDPSession, isSWorker: boolean) {
|
||||||
const { request } = params;
|
const { request } = params;
|
||||||
const { url } = request;
|
const { url } = request;
|
||||||
const {requestId, responseErrorReason, responseStatusCode, responseHeaders} = params;
|
const {requestId, responseErrorReason, responseStatusCode, responseHeaders} = params;
|
||||||
|
@ -341,7 +374,7 @@ export class Recorder
|
||||||
|
|
||||||
// if not consumed via takeStream, attempt async loading
|
// if not consumed via takeStream, attempt async loading
|
||||||
if (!streamingConsume) {
|
if (!streamingConsume) {
|
||||||
let fetcher = null;
|
let fetcher : AsyncFetcher;
|
||||||
|
|
||||||
if (reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE) {
|
if (reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE) {
|
||||||
fetcher = new AsyncFetcher(opts);
|
fetcher = new AsyncFetcher(opts);
|
||||||
|
@ -388,12 +421,12 @@ export class Recorder
|
||||||
try {
|
try {
|
||||||
await cdp.send("Fetch.fulfillRequest", {
|
await cdp.send("Fetch.fulfillRequest", {
|
||||||
requestId,
|
requestId,
|
||||||
responseCode: responseStatusCode,
|
responseCode: responseStatusCode || 0,
|
||||||
responseHeaders,
|
responseHeaders,
|
||||||
body
|
body
|
||||||
});
|
});
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
const type = reqresp.type;
|
const type = reqresp.resourceType;
|
||||||
if (type === "Document") {
|
if (type === "Document") {
|
||||||
logger.debug("document not loaded in browser, possibly other URLs missing", {url, type: reqresp.resourceType}, "recorder");
|
logger.debug("document not loaded in browser, possibly other URLs missing", {url, type: reqresp.resourceType}, "recorder");
|
||||||
} else {
|
} else {
|
||||||
|
@ -404,7 +437,7 @@ export class Recorder
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
startPage({pageid, url}) {
|
startPage({pageid, url} : {pageid: string, url: string}) {
|
||||||
this.pageid = pageid;
|
this.pageid = pageid;
|
||||||
this.logDetails = {page: url, workerid: this.workerid};
|
this.logDetails = {page: url, workerid: this.workerid};
|
||||||
if (this.pendingRequests && this.pendingRequests.size) {
|
if (this.pendingRequests && this.pendingRequests.size) {
|
||||||
|
@ -431,8 +464,8 @@ export class Recorder
|
||||||
while (numPending && !this.crawler.interrupted) {
|
while (numPending && !this.crawler.interrupted) {
|
||||||
const pending = [];
|
const pending = [];
|
||||||
for (const [requestId, reqresp] of this.pendingRequests.entries()) {
|
for (const [requestId, reqresp] of this.pendingRequests.entries()) {
|
||||||
const url = reqresp.url;
|
const url = reqresp.url || "";
|
||||||
const entry = {requestId, url};
|
const entry : {requestId: string, url: string, expectedSize?: number, readSize?: number} = {requestId, url};
|
||||||
if (reqresp.expectedSize) {
|
if (reqresp.expectedSize) {
|
||||||
entry.expectedSize = reqresp.expectedSize;
|
entry.expectedSize = reqresp.expectedSize;
|
||||||
}
|
}
|
||||||
|
@ -464,7 +497,7 @@ export class Recorder
|
||||||
await this.writer.flush();
|
await this.writer.flush();
|
||||||
}
|
}
|
||||||
|
|
||||||
shouldSkip(headers, url, method, resourceType) {
|
shouldSkip(headers: Protocol.Network.Headers, url?: string, method?: string, resourceType?: string) {
|
||||||
if (headers && !method) {
|
if (headers && !method) {
|
||||||
method = headers[":method"];
|
method = headers[":method"];
|
||||||
}
|
}
|
||||||
|
@ -477,7 +510,7 @@ export class Recorder
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (["EventSource", "WebSocket", "Ping"].includes(resourceType)) {
|
if (["EventSource", "WebSocket", "Ping"].includes(resourceType || "")) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -494,7 +527,7 @@ export class Recorder
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
async rewriteResponse(reqresp) {
|
async rewriteResponse(reqresp: RequestResponseInfo) {
|
||||||
const { url, responseHeadersList, extraOpts, payload } = reqresp;
|
const { url, responseHeadersList, extraOpts, payload } = reqresp;
|
||||||
|
|
||||||
if (!payload || !payload.length) {
|
if (!payload || !payload.length) {
|
||||||
|
@ -509,12 +542,12 @@ export class Recorder
|
||||||
switch (ct) {
|
switch (ct) {
|
||||||
case "application/x-mpegURL":
|
case "application/x-mpegURL":
|
||||||
case "application/vnd.apple.mpegurl":
|
case "application/vnd.apple.mpegurl":
|
||||||
string = payload.toString("utf-8");
|
string = payload.toString();
|
||||||
newString = rewriteHLS(string, {save: extraOpts});
|
newString = rewriteHLS(string, {save: extraOpts});
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case "application/dash+xml":
|
case "application/dash+xml":
|
||||||
string = payload.toString("utf-8");
|
string = payload.toString();
|
||||||
newString = rewriteDASH(string, {save: extraOpts});
|
newString = rewriteDASH(string, {save: extraOpts});
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -526,7 +559,7 @@ export class Recorder
|
||||||
const rw = baseDSRules.getRewriter(url);
|
const rw = baseDSRules.getRewriter(url);
|
||||||
|
|
||||||
if (rw !== baseDSRules.defaultRewriter) {
|
if (rw !== baseDSRules.defaultRewriter) {
|
||||||
string = payload.toString("utf-8");
|
string = payload.toString();
|
||||||
newString = rw.rewrite(string, {live: true, save: extraOpts});
|
newString = rw.rewrite(string, {live: true, save: extraOpts});
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -549,8 +582,11 @@ export class Recorder
|
||||||
//return Buffer.from(newString).toString("base64");
|
//return Buffer.from(newString).toString("base64");
|
||||||
}
|
}
|
||||||
|
|
||||||
_getContentType(headers) {
|
_getContentType(headers? : Protocol.Fetch.HeaderEntry[] | {name: string, value: string}[]) {
|
||||||
for (let header of headers) {
|
if (!headers) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
for (const header of headers) {
|
||||||
if (header.name.toLowerCase() === "content-type") {
|
if (header.name.toLowerCase() === "content-type") {
|
||||||
return header.value.split(";")[0];
|
return header.value.split(";")[0];
|
||||||
}
|
}
|
||||||
|
@ -559,8 +595,11 @@ export class Recorder
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
_getContentLen(headers) {
|
_getContentLen(headers? : Protocol.Fetch.HeaderEntry[]) {
|
||||||
for (let header of headers) {
|
if (!headers) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
for (const header of headers) {
|
||||||
if (header.name.toLowerCase() === "content-length") {
|
if (header.name.toLowerCase() === "content-length") {
|
||||||
return Number(header.value);
|
return Number(header.value);
|
||||||
}
|
}
|
||||||
|
@ -569,8 +608,11 @@ export class Recorder
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
_getContentRange(headers) {
|
_getContentRange(headers? : Protocol.Fetch.HeaderEntry[]) {
|
||||||
for (let header of headers) {
|
if (!headers) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
for (const header of headers) {
|
||||||
if (header.name.toLowerCase() === "content-range") {
|
if (header.name.toLowerCase() === "content-range") {
|
||||||
return header.value;
|
return header.value;
|
||||||
}
|
}
|
||||||
|
@ -579,15 +621,15 @@ export class Recorder
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
noResponseForStatus(status) {
|
noResponseForStatus(status: number | undefined | null) {
|
||||||
return (!status || status === 204 || (status >= 300 && status < 400));
|
return (!status || status === 204 || (status >= 300 && status < 400));
|
||||||
}
|
}
|
||||||
|
|
||||||
isValidUrl(url) {
|
isValidUrl(url?: string) {
|
||||||
return url && (url.startsWith("https:") || url.startsWith("http:"));
|
return url && (url.startsWith("https:") || url.startsWith("http:"));
|
||||||
}
|
}
|
||||||
|
|
||||||
pendingReqResp(requestId, reuseOnly = false) {
|
pendingReqResp(requestId: string, reuseOnly = false) {
|
||||||
if (!this.pendingRequests.has(requestId)) {
|
if (!this.pendingRequests.has(requestId)) {
|
||||||
if (reuseOnly || !requestId) {
|
if (reuseOnly || !requestId) {
|
||||||
return null;
|
return null;
|
||||||
|
@ -605,14 +647,14 @@ export class Recorder
|
||||||
return reqresp;
|
return reqresp;
|
||||||
} else {
|
} else {
|
||||||
const reqresp = this.pendingRequests.get(requestId);
|
const reqresp = this.pendingRequests.get(requestId);
|
||||||
if (requestId !== reqresp.requestId) {
|
if (reqresp && requestId !== reqresp.requestId) {
|
||||||
logger.warn("Invalid request id", {requestId, actualRequestId: reqresp.requestId}, "recorder");
|
logger.warn("Invalid request id", {requestId, actualRequestId: reqresp.requestId}, "recorder");
|
||||||
}
|
}
|
||||||
return reqresp;
|
return reqresp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
removeReqResp(requestId, allowReuse=false) {
|
removeReqResp(requestId: string, allowReuse=false) {
|
||||||
const reqresp = this.pendingRequests.get(requestId);
|
const reqresp = this.pendingRequests.get(requestId);
|
||||||
this.pendingRequests.delete(requestId);
|
this.pendingRequests.delete(requestId);
|
||||||
if (!allowReuse) {
|
if (!allowReuse) {
|
||||||
|
@ -621,13 +663,13 @@ export class Recorder
|
||||||
return reqresp;
|
return reqresp;
|
||||||
}
|
}
|
||||||
|
|
||||||
async serializeToWARC(reqresp) {
|
async serializeToWARC(reqresp: RequestResponseInfo) {
|
||||||
if (!reqresp.payload) {
|
if (!reqresp.payload) {
|
||||||
logNetwork("Not writing, no payload", {url: reqresp.url});
|
logNetwork("Not writing, no payload", {url: reqresp.url});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (reqresp.method === "GET" && !await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, reqresp.url)) {
|
if (reqresp.url && reqresp.method === "GET" && !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, reqresp.url))) {
|
||||||
logNetwork("Skipping dupe", {url: reqresp.url});
|
logNetwork("Skipping dupe", {url: reqresp.url});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -638,21 +680,21 @@ export class Recorder
|
||||||
this.warcQ.add(() => this.writer.writeRecordPair(responseRecord, requestRecord));
|
this.warcQ.add(() => this.writer.writeRecordPair(responseRecord, requestRecord));
|
||||||
}
|
}
|
||||||
|
|
||||||
async directFetchCapture(url) {
|
async directFetchCapture(url: string) : Promise<{fetched: boolean, mime: string}>{
|
||||||
const reqresp = new RequestResponseInfo(0);
|
const reqresp = new RequestResponseInfo("0");
|
||||||
reqresp.url = url;
|
reqresp.url = url;
|
||||||
reqresp.method = "GET";
|
reqresp.method = "GET";
|
||||||
|
|
||||||
logger.debug("Directly fetching page URL without browser", {url, ...this.logDetails}, "recorder");
|
logger.debug("Directly fetching page URL without browser", {url, ...this.logDetails}, "recorder");
|
||||||
|
|
||||||
const filter = (resp) => resp.status === 200 && !resp.headers.get("set-cookie");
|
const filter = (resp: Response) => resp.status === 200 && !resp.headers.get("set-cookie");
|
||||||
|
|
||||||
// ignore dupes: if previous URL was not a page, still load as page. if previous was page,
|
// ignore dupes: if previous URL was not a page, still load as page. if previous was page,
|
||||||
// should not get here, as dupe pages tracked via seen list
|
// should not get here, as dupe pages tracked via seen list
|
||||||
const fetcher = new AsyncFetcher({tempdir: this.tempdir, reqresp, recorder: this, networkId: 0, filter, ignoreDupe: true});
|
const fetcher = new AsyncFetcher({tempdir: this.tempdir, reqresp, recorder: this, networkId: "0", filter, ignoreDupe: true});
|
||||||
const res = await fetcher.load();
|
const res = await fetcher.load();
|
||||||
|
|
||||||
const mime = reqresp && reqresp.responseHeaders["content-type"] && reqresp.responseHeaders["content-type"].split(";")[0];
|
const mime = reqresp && reqresp.responseHeaders && reqresp.responseHeaders["content-type"] && reqresp.responseHeaders["content-type"].split(";")[0] || "";
|
||||||
|
|
||||||
return {fetched: res === "fetched", mime};
|
return {fetched: res === "fetched", mime};
|
||||||
}
|
}
|
||||||
|
@ -661,7 +703,20 @@ export class Recorder
|
||||||
// =================================================================
|
// =================================================================
|
||||||
class AsyncFetcher
|
class AsyncFetcher
|
||||||
{
|
{
|
||||||
constructor({tempdir, reqresp, expectedSize = -1, recorder, networkId, filter = null, ignoreDupe = false}) {
|
reqresp: RequestResponseInfo;
|
||||||
|
|
||||||
|
networkId: string;
|
||||||
|
filter?: (resp: Response) => boolean;
|
||||||
|
ignoreDupe = false;
|
||||||
|
|
||||||
|
recorder: Recorder;
|
||||||
|
|
||||||
|
tempdir: string;
|
||||||
|
filename: string;
|
||||||
|
|
||||||
|
constructor({tempdir, reqresp, expectedSize = -1, recorder, networkId, filter = undefined, ignoreDupe = false} :
|
||||||
|
{tempdir: string, reqresp: RequestResponseInfo, expectedSize?: number, recorder: Recorder,
|
||||||
|
networkId: string, filter?: (resp: Response) => boolean, ignoreDupe?: boolean }) {
|
||||||
this.reqresp = reqresp;
|
this.reqresp = reqresp;
|
||||||
this.reqresp.expectedSize = expectedSize;
|
this.reqresp.expectedSize = expectedSize;
|
||||||
this.reqresp.asyncLoading = true;
|
this.reqresp.asyncLoading = true;
|
||||||
|
@ -685,7 +740,7 @@ class AsyncFetcher
|
||||||
let fetched = "notfetched";
|
let fetched = "notfetched";
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (reqresp.method === "GET" && !await crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url)) {
|
if (reqresp.method === "GET" && url && !(await crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url))) {
|
||||||
if (!this.ignoreDupe) {
|
if (!this.ignoreDupe) {
|
||||||
this.reqresp.asyncLoading = false;
|
this.reqresp.asyncLoading = false;
|
||||||
return "dupe";
|
return "dupe";
|
||||||
|
@ -719,7 +774,7 @@ class AsyncFetcher
|
||||||
//return fetched;
|
//return fetched;
|
||||||
}
|
}
|
||||||
|
|
||||||
const externalBuffer = serializer.externalBuffer;
|
const externalBuffer : TempFileBuffer = serializer.externalBuffer as TempFileBuffer;
|
||||||
|
|
||||||
if (externalBuffer) {
|
if (externalBuffer) {
|
||||||
const { currSize, buffers, fh } = externalBuffer;
|
const { currSize, buffers, fh } = externalBuffer;
|
||||||
|
@ -731,14 +786,14 @@ class AsyncFetcher
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Object.keys(reqresp.extraOpts).length) {
|
if (Object.keys(reqresp.extraOpts).length) {
|
||||||
responseRecord.warcHeaders["WARC-JSON-Metadata"] = JSON.stringify(reqresp.extraOpts);
|
responseRecord.warcHeaders.headers.set("WARC-JSON-Metadata", JSON.stringify(reqresp.extraOpts));
|
||||||
}
|
}
|
||||||
|
|
||||||
recorder.warcQ.add(() => recorder.writer.writeRecordPair(responseRecord, requestRecord, serializer));
|
recorder.warcQ.add(() => recorder.writer.writeRecordPair(responseRecord, requestRecord, serializer));
|
||||||
|
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.error("Streaming Fetch Error", {url, networkId, filename, ...errJSON(e), ...logDetails}, "recorder");
|
logger.error("Streaming Fetch Error", {url, networkId, filename, ...errJSON(e), ...logDetails}, "recorder");
|
||||||
await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url);
|
await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url!);
|
||||||
} finally {
|
} finally {
|
||||||
recorder.removeReqResp(networkId);
|
recorder.removeReqResp(networkId);
|
||||||
}
|
}
|
||||||
|
@ -761,9 +816,9 @@ class AsyncFetcher
|
||||||
signal = abort.signal;
|
signal = abort.signal;
|
||||||
}
|
}
|
||||||
|
|
||||||
const resp = await fetch(url, {method, headers, body: reqresp.postData || undefined, signal});
|
const resp = await fetch(url!, {method, headers, body: reqresp.postData || undefined, signal});
|
||||||
|
|
||||||
if (this.filter && !this.filter(resp)) {
|
if (this.filter && !this.filter(resp) && abort) {
|
||||||
abort.abort();
|
abort.abort();
|
||||||
throw new Error("invalid response, ignoring fetch");
|
throw new Error("invalid response, ignoring fetch");
|
||||||
}
|
}
|
||||||
|
@ -778,7 +833,7 @@ class AsyncFetcher
|
||||||
|
|
||||||
} else if (!resp.body) {
|
} else if (!resp.body) {
|
||||||
logger.error("Empty body, stopping fetch", {url}, "recorder");
|
logger.error("Empty body, stopping fetch", {url}, "recorder");
|
||||||
await this.recorder.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url);
|
await this.recorder.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url!);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -787,7 +842,7 @@ class AsyncFetcher
|
||||||
return this.takeReader(resp.body.getReader());
|
return this.takeReader(resp.body.getReader());
|
||||||
}
|
}
|
||||||
|
|
||||||
async* takeReader(reader) {
|
async* takeReader(reader: ReadableStreamDefaultReader<Uint8Array>) {
|
||||||
try {
|
try {
|
||||||
while (true) {
|
while (true) {
|
||||||
const { value, done } = await reader.read();
|
const { value, done } = await reader.read();
|
||||||
|
@ -803,7 +858,7 @@ class AsyncFetcher
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async* takeStreamIter(cdp, stream) {
|
async* takeStreamIter(cdp: CDPSession, stream: Protocol.IO.StreamHandle) {
|
||||||
try {
|
try {
|
||||||
while (true) {
|
while (true) {
|
||||||
const {data, base64Encoded, eof} = await cdp.send("IO.read", {handle: stream});
|
const {data, base64Encoded, eof} = await cdp.send("IO.read", {handle: stream});
|
||||||
|
@ -825,7 +880,12 @@ class AsyncFetcher
|
||||||
// =================================================================
|
// =================================================================
|
||||||
class ResponseStreamAsyncFetcher extends AsyncFetcher
|
class ResponseStreamAsyncFetcher extends AsyncFetcher
|
||||||
{
|
{
|
||||||
constructor(opts) {
|
cdp: CDPSession;
|
||||||
|
requestId: string;
|
||||||
|
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
constructor(opts: any) {
|
||||||
super(opts);
|
super(opts);
|
||||||
this.cdp = opts.cdp;
|
this.cdp = opts.cdp;
|
||||||
this.requestId = opts.requestId;
|
this.requestId = opts.requestId;
|
||||||
|
@ -845,7 +905,11 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher
|
||||||
// =================================================================
|
// =================================================================
|
||||||
class NetworkLoadStreamAsyncFetcher extends AsyncFetcher
|
class NetworkLoadStreamAsyncFetcher extends AsyncFetcher
|
||||||
{
|
{
|
||||||
constructor(opts) {
|
cdp: CDPSession;
|
||||||
|
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
constructor(opts: any) {
|
||||||
super(opts);
|
super(opts);
|
||||||
this.cdp = opts.cdp;
|
this.cdp = opts.cdp;
|
||||||
}
|
}
|
||||||
|
@ -883,7 +947,7 @@ class NetworkLoadStreamAsyncFetcher extends AsyncFetcher
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
reqresp.status = httpStatusCode;
|
reqresp.status = httpStatusCode || 0;
|
||||||
reqresp.responseHeaders = headers || {};
|
reqresp.responseHeaders = headers || {};
|
||||||
|
|
||||||
return this.takeStreamIter(cdp, stream);
|
return this.takeStreamIter(cdp, stream);
|
||||||
|
@ -892,15 +956,15 @@ class NetworkLoadStreamAsyncFetcher extends AsyncFetcher
|
||||||
|
|
||||||
// =================================================================
|
// =================================================================
|
||||||
// response
|
// response
|
||||||
function createResponse(reqresp, pageid, contentIter) {
|
function createResponse(reqresp: RequestResponseInfo, pageid: string, contentIter?: AsyncIterable<Uint8Array> | Iterable<Uint8Array>) {
|
||||||
const url = reqresp.url;
|
const url = reqresp.url;
|
||||||
const warcVersion = "WARC/1.1";
|
const warcVersion = "WARC/1.1";
|
||||||
const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`;
|
const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`;
|
||||||
const date = new Date().toISOString();
|
const date = new Date().toISOString();
|
||||||
|
|
||||||
const httpHeaders = reqresp.getResponseHeadersDict(reqresp.payload ? reqresp.payload.length : null);
|
const httpHeaders = reqresp.getResponseHeadersDict(reqresp.payload ? reqresp.payload.length : 0);
|
||||||
|
|
||||||
const warcHeaders = {
|
const warcHeaders : Record<string, string> = {
|
||||||
"WARC-Page-ID": pageid,
|
"WARC-Page-ID": pageid,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -909,7 +973,7 @@ function createResponse(reqresp, pageid, contentIter) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!contentIter) {
|
if (!contentIter) {
|
||||||
contentIter = [reqresp.payload];
|
contentIter = [reqresp.payload] as Iterable<Uint8Array>;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Object.keys(reqresp.extraOpts).length) {
|
if (Object.keys(reqresp.extraOpts).length) {
|
||||||
|
@ -923,7 +987,7 @@ function createResponse(reqresp, pageid, contentIter) {
|
||||||
|
|
||||||
// =================================================================
|
// =================================================================
|
||||||
// request
|
// request
|
||||||
function createRequest(reqresp, responseRecord, pageid) {
|
function createRequest(reqresp: RequestResponseInfo, responseRecord: WARCRecord, pageid: string) {
|
||||||
const url = reqresp.url;
|
const url = reqresp.url;
|
||||||
const warcVersion = "WARC/1.1";
|
const warcVersion = "WARC/1.1";
|
||||||
const method = reqresp.method;
|
const method = reqresp.method;
|
||||||
|
@ -936,12 +1000,12 @@ function createRequest(reqresp, responseRecord, pageid) {
|
||||||
|
|
||||||
const httpHeaders = reqresp.getRequestHeadersDict();
|
const httpHeaders = reqresp.getRequestHeadersDict();
|
||||||
|
|
||||||
const warcHeaders = {
|
const warcHeaders : Record<string, string> = {
|
||||||
"WARC-Concurrent-To": responseRecord.warcHeader("WARC-Record-ID"),
|
"WARC-Concurrent-To": responseRecord.warcHeader("WARC-Record-ID")!,
|
||||||
"WARC-Page-ID": pageid,
|
"WARC-Page-ID": pageid,
|
||||||
};
|
};
|
||||||
|
|
||||||
const date = responseRecord.warcDate;
|
const date = responseRecord.warcDate || undefined;
|
||||||
|
|
||||||
return WARCRecord.create({
|
return WARCRecord.create({
|
||||||
url, date, warcVersion, type: "request", warcHeaders,
|
url, date, warcVersion, type: "request", warcHeaders,
|
|
@ -1,4 +1,4 @@
|
||||||
import Redis from "ioredis";
|
import { Redis } from "ioredis";
|
||||||
import { logger } from "./logger.js";
|
import { logger } from "./logger.js";
|
||||||
|
|
||||||
const error = console.error;
|
const error = console.error;
|
||||||
|
@ -15,7 +15,7 @@ console.error = function (...args) {
|
||||||
args[0].indexOf("[ioredis] Unhandled error event") === 0
|
args[0].indexOf("[ioredis] Unhandled error event") === 0
|
||||||
) {
|
) {
|
||||||
|
|
||||||
let now = Date.now();
|
const now = Date.now();
|
||||||
|
|
||||||
if ((now - lastLogTime) > REDIS_ERROR_LOG_INTERVAL_SECS) {
|
if ((now - lastLogTime) > REDIS_ERROR_LOG_INTERVAL_SECS) {
|
||||||
if (lastLogTime && exitOnError) {
|
if (lastLogTime && exitOnError) {
|
||||||
|
@ -29,7 +29,7 @@ console.error = function (...args) {
|
||||||
error.call(console, ...args);
|
error.call(console, ...args);
|
||||||
};
|
};
|
||||||
|
|
||||||
export async function initRedis(url) {
|
export async function initRedis(url: string) {
|
||||||
const redis = new Redis(url, {lazyConnect: true});
|
const redis = new Redis(url, {lazyConnect: true});
|
||||||
await redis.connect();
|
await redis.connect();
|
||||||
return redis;
|
return redis;
|
|
@ -1,5 +1,8 @@
|
||||||
|
// @ts-expect-error TODO fill in why error is expected
|
||||||
import { getStatusText } from "@webrecorder/wabac/src/utils.js";
|
import { getStatusText } from "@webrecorder/wabac/src/utils.js";
|
||||||
|
|
||||||
|
import { Protocol } from "puppeteer-core";
|
||||||
|
|
||||||
const CONTENT_LENGTH = "content-length";
|
const CONTENT_LENGTH = "content-length";
|
||||||
const CONTENT_TYPE = "content-type";
|
const CONTENT_TYPE = "content-type";
|
||||||
const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"];
|
const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"];
|
||||||
|
@ -8,53 +11,63 @@ const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"];
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
export class RequestResponseInfo
|
export class RequestResponseInfo
|
||||||
{
|
{
|
||||||
constructor(requestId) {
|
_created: Date = new Date();
|
||||||
this._created = new Date();
|
|
||||||
|
requestId: string;
|
||||||
|
|
||||||
|
ts?: string;
|
||||||
|
|
||||||
|
method?: string;
|
||||||
|
url!: string;
|
||||||
|
protocol?: string = "HTTP/1.1";
|
||||||
|
|
||||||
|
// request data
|
||||||
|
requestHeaders?: Record<string, string>;
|
||||||
|
requestHeadersText?: string;
|
||||||
|
|
||||||
|
postData?: string;
|
||||||
|
hasPostData: boolean = false;
|
||||||
|
|
||||||
|
// response data
|
||||||
|
status: number = 0;
|
||||||
|
statusText?: string;
|
||||||
|
|
||||||
|
responseHeaders?: Record<string, string>;
|
||||||
|
responseHeadersList?: {name: string, value: string}[];
|
||||||
|
responseHeadersText?: string;
|
||||||
|
|
||||||
|
payload?: Uint8Array;
|
||||||
|
|
||||||
|
// misc
|
||||||
|
fromServiceWorker: boolean = false;
|
||||||
|
|
||||||
|
frameId?: string;
|
||||||
|
|
||||||
|
fetch: boolean = false;
|
||||||
|
|
||||||
|
resourceType?: string;
|
||||||
|
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
extraOpts: Record<string, any> = {};
|
||||||
|
|
||||||
|
// stats
|
||||||
|
readSize: number = 0;
|
||||||
|
expectedSize: number = 0;
|
||||||
|
|
||||||
|
// set to true to indicate async loading in progress
|
||||||
|
asyncLoading: boolean = false;
|
||||||
|
|
||||||
|
// set to add truncated message
|
||||||
|
truncated?: string;
|
||||||
|
|
||||||
|
constructor(requestId: string) {
|
||||||
this.requestId = requestId;
|
this.requestId = requestId;
|
||||||
|
|
||||||
this.ts = null;
|
|
||||||
|
|
||||||
// request data
|
|
||||||
this.method = null;
|
|
||||||
this.url = null;
|
|
||||||
this.protocol = "HTTP/1.1";
|
|
||||||
|
|
||||||
this.requestHeaders = null;
|
|
||||||
this.requestHeadersText = null;
|
|
||||||
|
|
||||||
this.postData = null;
|
|
||||||
this.hasPostData = false;
|
|
||||||
|
|
||||||
// response data
|
|
||||||
this.status = 0;
|
|
||||||
this.statusText = null;
|
|
||||||
|
|
||||||
this.responseHeaders = null;
|
|
||||||
this.responseHeadersList = null;
|
|
||||||
this.responseHeadersText = null;
|
|
||||||
|
|
||||||
this.payload = null;
|
|
||||||
|
|
||||||
this.fromServiceWorker = false;
|
|
||||||
|
|
||||||
this.fetch = false;
|
|
||||||
|
|
||||||
this.resourceType = null;
|
|
||||||
|
|
||||||
this.extraOpts = {};
|
|
||||||
|
|
||||||
this.readSize = 0;
|
|
||||||
this.expectedSize = 0;
|
|
||||||
|
|
||||||
// set to true to indicate async loading in progress
|
|
||||||
this.asyncLoading = false;
|
|
||||||
|
|
||||||
// set to add truncated message
|
|
||||||
this.truncated = null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fillRequest(params) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
fillRequest(params: Record<string, any>) {
|
||||||
this.url = params.request.url;
|
this.url = params.request.url;
|
||||||
this.method = params.request.method;
|
this.method = params.request.method;
|
||||||
if (!this.requestHeaders) {
|
if (!this.requestHeaders) {
|
||||||
|
@ -69,7 +82,9 @@ export class RequestResponseInfo
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fillFetchRequestPaused(params) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
fillFetchRequestPaused(params: Record<string, any>) {
|
||||||
this.fillRequest(params);
|
this.fillRequest(params);
|
||||||
|
|
||||||
this.status = params.responseStatusCode;
|
this.status = params.responseStatusCode;
|
||||||
|
@ -83,7 +98,7 @@ export class RequestResponseInfo
|
||||||
this.frameId = params.frameId;
|
this.frameId = params.frameId;
|
||||||
}
|
}
|
||||||
|
|
||||||
fillResponse(response) {
|
fillResponse(response: Protocol.Network.Response) {
|
||||||
// if initial fetch was a 200, but now replacing with 304, don't!
|
// if initial fetch was a 200, but now replacing with 304, don't!
|
||||||
if (response.status == 304 && this.status && this.status != 304 && this.url) {
|
if (response.status == 304 && this.status && this.status != 304 && this.url) {
|
||||||
return;
|
return;
|
||||||
|
@ -112,8 +127,8 @@ export class RequestResponseInfo
|
||||||
this.fromServiceWorker = !!response.fromServiceWorker;
|
this.fromServiceWorker = !!response.fromServiceWorker;
|
||||||
|
|
||||||
if (response.securityDetails) {
|
if (response.securityDetails) {
|
||||||
const issuer = response.securityDetails.issuer || "";
|
const issuer : string = response.securityDetails.issuer || "";
|
||||||
const ctc = response.securityDetails.certificateTransparencyCompliance === "compliant" ? "1" : "0";
|
const ctc : string = response.securityDetails.certificateTransparencyCompliance === "compliant" ? "1" : "0";
|
||||||
this.extraOpts.cert = {issuer, ctc};
|
this.extraOpts.cert = {issuer, ctc};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -124,14 +139,15 @@ export class RequestResponseInfo
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
const headers = new Headers(this.responseHeaders);
|
const headers = new Headers(this.responseHeaders);
|
||||||
const redirUrl = new URL(headers.get("location"), this.url).href;
|
const location = headers.get("location") || "";
|
||||||
|
const redirUrl = new URL(location, this.url).href;
|
||||||
return this.url === redirUrl;
|
return this.url === redirUrl;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fillResponseReceivedExtraInfo(params) {
|
fillResponseReceivedExtraInfo(params: Record<string, string>) {
|
||||||
// this.responseHeaders = params.headers;
|
// this.responseHeaders = params.headers;
|
||||||
// if (params.headersText) {
|
// if (params.headersText) {
|
||||||
// this.responseHeadersText = params.headersText;
|
// this.responseHeadersText = params.headersText;
|
||||||
|
@ -139,22 +155,28 @@ export class RequestResponseInfo
|
||||||
this.extraOpts.ipType = params.resourceIPAddressSpace;
|
this.extraOpts.ipType = params.resourceIPAddressSpace;
|
||||||
}
|
}
|
||||||
|
|
||||||
fillFetchResponse(response) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
fillFetchResponse(response: Record<string, any>) {
|
||||||
this.responseHeaders = Object.fromEntries(response.headers);
|
this.responseHeaders = Object.fromEntries(response.headers);
|
||||||
this.status = response.status;
|
this.status = response.status;
|
||||||
this.statusText = response.statusText || getStatusText(this.status);
|
this.statusText = response.statusText || getStatusText(this.status);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fillRequestExtraInfo(params) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
fillRequestExtraInfo(params: Record<string, any>) {
|
||||||
this.requestHeaders = params.headers;
|
this.requestHeaders = params.headers;
|
||||||
}
|
}
|
||||||
|
|
||||||
getResponseHeadersText() {
|
getResponseHeadersText() {
|
||||||
let headers = `${this.protocol} ${this.status} ${this.statusText}\r\n`;
|
let headers = `${this.protocol} ${this.status} ${this.statusText}\r\n`;
|
||||||
|
|
||||||
for (const header of Object.keys(this.responseHeaders)) {
|
if (this.responseHeaders) {
|
||||||
headers += `${header}: ${this.responseHeaders[header].replace(/\n/g, ", ")}\r\n`;
|
for (const header of Object.keys(this.responseHeaders)) {
|
||||||
|
headers += `${header}: ${this.responseHeaders[header].replace(/\n/g, ", ")}\r\n`;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
headers += "\r\n";
|
headers += "\r\n";
|
||||||
return headers;
|
return headers;
|
||||||
|
@ -165,14 +187,14 @@ export class RequestResponseInfo
|
||||||
}
|
}
|
||||||
|
|
||||||
getRequestHeadersDict() {
|
getRequestHeadersDict() {
|
||||||
return this._getHeadersDict(this.requestHeaders, null);
|
return this._getHeadersDict(this.requestHeaders);
|
||||||
}
|
}
|
||||||
|
|
||||||
getResponseHeadersDict(length) {
|
getResponseHeadersDict(length = 0) {
|
||||||
return this._getHeadersDict(this.responseHeaders, this.responseHeadersList, length);
|
return this._getHeadersDict(this.responseHeaders, this.responseHeadersList, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
_getHeadersDict(headersDict, headersList, actualContentLength) {
|
_getHeadersDict(headersDict?: Record<string, string>, headersList?: {name: string, value: string}[], actualContentLength = 0) {
|
||||||
if (!headersDict && headersList) {
|
if (!headersDict && headersList) {
|
||||||
headersDict = {};
|
headersDict = {};
|
||||||
|
|
|
@ -1,29 +1,39 @@
|
||||||
import ws from "ws";
|
import ws, { WebSocket } from "ws";
|
||||||
import http from "http";
|
import http, { IncomingMessage, ServerResponse } from "http";
|
||||||
import url from "url";
|
import url from "url";
|
||||||
import fs from "fs";
|
import fs from "fs";
|
||||||
|
|
||||||
import { initRedis } from "./redis.js";
|
import { initRedis } from "./redis.js";
|
||||||
import { logger } from "./logger.js";
|
import { logger } from "./logger.js";
|
||||||
|
import { Duplex } from "stream";
|
||||||
|
import { CDPSession, Page } from "puppeteer-core";
|
||||||
|
import { WorkerId } from "./state.js";
|
||||||
|
|
||||||
const indexHTML = fs.readFileSync(new URL("../html/screencast.html", import.meta.url), {encoding: "utf8"});
|
const indexHTML = fs.readFileSync(new URL("../../html/screencast.html", import.meta.url), {encoding: "utf8"});
|
||||||
|
|
||||||
|
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
class WSTransport
|
class WSTransport
|
||||||
{
|
{
|
||||||
constructor(port) {
|
allWS = new Set<WebSocket>();
|
||||||
this.allWS = new Set();
|
// eslint-disable-next-line no-use-before-define
|
||||||
|
caster!: ScreenCaster;
|
||||||
|
wss: ws.Server;
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
httpServer: any;
|
||||||
|
|
||||||
|
|
||||||
this.caster = null;
|
constructor(port: number) {
|
||||||
|
this.allWS = new Set();
|
||||||
|
|
||||||
this.wss = new ws.Server({ noServer: true });
|
this.wss = new ws.Server({ noServer: true });
|
||||||
|
|
||||||
this.wss.on("connection", (ws) => this.initWebSocket(ws));
|
this.wss.on("connection", (ws: WebSocket) => this.initWebSocket(ws));
|
||||||
|
|
||||||
this.httpServer = http.createServer((...args) => this.handleRequest(...args));
|
this.httpServer = http.createServer((...args) => this.handleRequest(...args));
|
||||||
this.httpServer.on("upgrade", (request, socket, head) => {
|
this.httpServer.on("upgrade", (request: IncomingMessage, socket: Duplex, head: Buffer) => {
|
||||||
const pathname = url.parse(request.url).pathname;
|
const pathname = url.parse(request.url || "").pathname;
|
||||||
|
|
||||||
if (pathname === "/ws") {
|
if (pathname === "/ws") {
|
||||||
this.wss.handleUpgrade(request, socket, head, (ws) => {
|
this.wss.handleUpgrade(request, socket, head, (ws) => {
|
||||||
|
@ -35,8 +45,8 @@ class WSTransport
|
||||||
this.httpServer.listen(port);
|
this.httpServer.listen(port);
|
||||||
}
|
}
|
||||||
|
|
||||||
async handleRequest(req, res) {
|
async handleRequest(req: IncomingMessage, res: ServerResponse) {
|
||||||
const pathname = url.parse(req.url).pathname;
|
const pathname = url.parse(req.url || "").pathname;
|
||||||
switch (pathname) {
|
switch (pathname) {
|
||||||
case "/":
|
case "/":
|
||||||
res.writeHead(200, {"Content-Type": "text/html"});
|
res.writeHead(200, {"Content-Type": "text/html"});
|
||||||
|
@ -48,7 +58,7 @@ class WSTransport
|
||||||
res.end("Not Found");
|
res.end("Not Found");
|
||||||
}
|
}
|
||||||
|
|
||||||
initWebSocket(ws) {
|
initWebSocket(ws: WebSocket) {
|
||||||
for (const packet of this.caster.iterCachedData()) {
|
for (const packet of this.caster.iterCachedData()) {
|
||||||
ws.send(JSON.stringify(packet));
|
ws.send(JSON.stringify(packet));
|
||||||
}
|
}
|
||||||
|
@ -71,10 +81,12 @@ class WSTransport
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
sendAll(packet) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
packet = JSON.stringify(packet);
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
sendAll(packet: Record<any, any>) {
|
||||||
|
const packetStr = JSON.stringify(packet);
|
||||||
for (const ws of this.allWS) {
|
for (const ws of this.allWS) {
|
||||||
ws.send(packet);
|
ws.send(packetStr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -87,22 +99,30 @@ class WSTransport
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
class RedisPubSubTransport
|
class RedisPubSubTransport
|
||||||
{
|
{
|
||||||
constructor(redisUrl, crawlId) {
|
numConnections: number = 0;
|
||||||
this.numConnections = 0;
|
castChannel: string;
|
||||||
|
// eslint-disable-next-line no-use-before-define
|
||||||
|
caster!: ScreenCaster;
|
||||||
|
ctrlChannel: string;
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
redis: any;
|
||||||
|
|
||||||
|
constructor(redisUrl: string, crawlId: string) {
|
||||||
this.castChannel = `c:${crawlId}:cast`;
|
this.castChannel = `c:${crawlId}:cast`;
|
||||||
this.ctrlChannel = `c:${crawlId}:ctrl`;
|
this.ctrlChannel = `c:${crawlId}:ctrl`;
|
||||||
|
|
||||||
this.init(redisUrl);
|
this.init(redisUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
async init(redisUrl) {
|
async init(redisUrl: string) {
|
||||||
this.redis = await initRedis(redisUrl);
|
this.redis = await initRedis(redisUrl);
|
||||||
|
|
||||||
const subRedis = await initRedis(redisUrl);
|
const subRedis = await initRedis(redisUrl);
|
||||||
|
|
||||||
await subRedis.subscribe(this.ctrlChannel);
|
await subRedis.subscribe(this.ctrlChannel);
|
||||||
|
|
||||||
subRedis.on("message", async (channel, message) => {
|
subRedis.on("message", async (channel: string, message: string) => {
|
||||||
if (channel !== this.ctrlChannel) {
|
if (channel !== this.ctrlChannel) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -129,7 +149,9 @@ class RedisPubSubTransport
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
async sendAll(packet) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
async sendAll(packet: Record<any, any>) {
|
||||||
await this.redis.publish(this.castChannel, JSON.stringify(packet));
|
await this.redis.publish(this.castChannel, JSON.stringify(packet));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -143,19 +165,20 @@ class RedisPubSubTransport
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
class ScreenCaster
|
class ScreenCaster
|
||||||
{
|
{
|
||||||
constructor(transport, numWorkers) {
|
transport: WSTransport;
|
||||||
|
caches = new Map<WorkerId, string>();
|
||||||
|
urls = new Map<WorkerId, string>();
|
||||||
|
cdps = new Map<WorkerId, CDPSession>();
|
||||||
|
maxWidth = 640;
|
||||||
|
maxHeight = 480;
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
initMsg: {[key: string]: any};
|
||||||
|
|
||||||
|
constructor(transport: WSTransport, numWorkers: number) {
|
||||||
this.transport = transport;
|
this.transport = transport;
|
||||||
this.transport.caster = this;
|
this.transport.caster = this;
|
||||||
|
|
||||||
this.caches = new Map();
|
|
||||||
this.urls = new Map();
|
|
||||||
|
|
||||||
this.cdps = new Map();
|
|
||||||
|
|
||||||
// todo: make customizable
|
|
||||||
this.maxWidth = 640;
|
|
||||||
this.maxHeight = 480;
|
|
||||||
|
|
||||||
this.initMsg = {
|
this.initMsg = {
|
||||||
msg: "init",
|
msg: "init",
|
||||||
width: this.maxWidth,
|
width: this.maxWidth,
|
||||||
|
@ -174,7 +197,7 @@ class ScreenCaster
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async screencastPage(page, cdp, id) {
|
async screencastPage(page: Page, cdp: CDPSession, id: WorkerId) {
|
||||||
this.urls.set(id, page.url());
|
this.urls.set(id, page.url());
|
||||||
|
|
||||||
// shouldn't happen, getting duplicate cdp
|
// shouldn't happen, getting duplicate cdp
|
||||||
|
@ -220,7 +243,7 @@ class ScreenCaster
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async stopById(id, sendClose=false) {
|
async stopById(id: WorkerId, sendClose=false) {
|
||||||
this.caches.delete(id);
|
this.caches.delete(id);
|
||||||
this.urls.delete(id);
|
this.urls.delete(id);
|
||||||
|
|
||||||
|
@ -241,24 +264,32 @@ class ScreenCaster
|
||||||
this.cdps.delete(id);
|
this.cdps.delete(id);
|
||||||
}
|
}
|
||||||
|
|
||||||
async startCast(cdp, id) {
|
async startCast(cdp: CDPSession, id: WorkerId) {
|
||||||
if (cdp._startedCast) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
if ((cdp as any)._startedCast) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
cdp._startedCast = true;
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
(cdp as any)._startedCast = true;
|
||||||
|
|
||||||
logger.info("Started Screencast", {workerid: id}, "screencast");
|
logger.info("Started Screencast", {workerid: id}, "screencast");
|
||||||
|
|
||||||
await cdp.send("Page.startScreencast", {format: "png", everyNthFrame: 1, maxWidth: this.maxWidth, maxHeight: this.maxHeight});
|
await cdp.send("Page.startScreencast", {format: "png", everyNthFrame: 1, maxWidth: this.maxWidth, maxHeight: this.maxHeight});
|
||||||
}
|
}
|
||||||
|
|
||||||
async stopCast(cdp, id) {
|
async stopCast(cdp: CDPSession, id: WorkerId) {
|
||||||
if (!cdp._startedCast) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
if (!(cdp as any)._startedCast) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
cdp._startedCast = false;
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
(cdp as any)._startedCast = false;
|
||||||
|
|
||||||
logger.info("Stopping Screencast", {workerid: id}, "screencast");
|
logger.info("Stopping Screencast", {workerid: id}, "screencast");
|
||||||
|
|
|
@ -2,11 +2,18 @@ import sharp from "sharp";
|
||||||
|
|
||||||
import { WARCResourceWriter } from "./warcresourcewriter.js";
|
import { WARCResourceWriter } from "./warcresourcewriter.js";
|
||||||
import { logger, errJSON } from "./logger.js";
|
import { logger, errJSON } from "./logger.js";
|
||||||
|
import { Browser } from "./browser.js";
|
||||||
|
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|
||||||
export const screenshotTypes = {
|
type ScreenShotType = {
|
||||||
|
type: string;
|
||||||
|
omitBackground: boolean;
|
||||||
|
fullPage: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
export const screenshotTypes : Record<string, ScreenShotType> = {
|
||||||
"view": {
|
"view": {
|
||||||
type: "png",
|
type: "png",
|
||||||
omitBackground: true,
|
omitBackground: true,
|
||||||
|
@ -24,10 +31,15 @@ export const screenshotTypes = {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
export class Screenshots extends WARCResourceWriter {
|
export class Screenshots extends WARCResourceWriter {
|
||||||
|
browser: Browser;
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
page: any;
|
||||||
|
|
||||||
constructor(opts) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
constructor(opts: any) {
|
||||||
super({...opts, warcName: "screenshots.warc.gz"});
|
super({...opts, warcName: "screenshots.warc.gz"});
|
||||||
this.browser = opts.browser;
|
this.browser = opts.browser;
|
||||||
this.page = opts.page;
|
this.page = opts.page;
|
|
@ -1,10 +1,34 @@
|
||||||
import { logger } from "./logger.js";
|
import { logger } from "./logger.js";
|
||||||
import { MAX_DEPTH } from "./constants.js";
|
import { MAX_DEPTH } from "./constants.js";
|
||||||
|
|
||||||
|
type ScopeType =
|
||||||
|
| "prefix"
|
||||||
|
| "host"
|
||||||
|
| "domain"
|
||||||
|
| "page"
|
||||||
|
| "page-spa"
|
||||||
|
| "any"
|
||||||
|
| "custom";
|
||||||
|
|
||||||
export class ScopedSeed
|
export class ScopedSeed
|
||||||
{
|
{
|
||||||
constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
|
url: string;
|
||||||
|
scopeType: ScopeType;
|
||||||
|
include: RegExp[];
|
||||||
|
exclude: RegExp[] = [];
|
||||||
|
allowHash = false;
|
||||||
|
depth = -1;
|
||||||
|
sitemap?: string | null;
|
||||||
|
extraHops = 0;
|
||||||
|
|
||||||
|
maxExtraHops = 0;
|
||||||
|
maxDepth = 0;
|
||||||
|
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
{url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} :
|
||||||
|
{url: string, scopeType: ScopeType, include: string[], exclude?: string[], allowHash?: boolean, depth?: number, sitemap?: string | boolean | null, extraHops?: number}
|
||||||
|
) {
|
||||||
const parsedUrl = this.parseUrl(url);
|
const parsedUrl = this.parseUrl(url);
|
||||||
if (!parsedUrl) {
|
if (!parsedUrl) {
|
||||||
throw new Error("Invalid URL");
|
throw new Error("Invalid URL");
|
||||||
|
@ -19,8 +43,9 @@ export class ScopedSeed
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.scopeType !== "custom") {
|
if (this.scopeType !== "custom") {
|
||||||
[include, allowHash] = this.scopeFromType(this.scopeType, parsedUrl);
|
const [includeNew, allowHashNew] = this.scopeFromType(this.scopeType, parsedUrl);
|
||||||
this.include = [...include, ...this.include];
|
this.include = [...includeNew, ...this.include];
|
||||||
|
allowHash = allowHashNew;
|
||||||
}
|
}
|
||||||
|
|
||||||
// for page scope, the depth is set to extraHops, as no other
|
// for page scope, the depth is set to extraHops, as no other
|
||||||
|
@ -35,7 +60,10 @@ export class ScopedSeed
|
||||||
this.maxDepth = depth < 0 ? MAX_DEPTH : depth;
|
this.maxDepth = depth < 0 ? MAX_DEPTH : depth;
|
||||||
}
|
}
|
||||||
|
|
||||||
parseRx(value) {
|
//parseRx(value? : union[string[], string, RegExp[]]) -> RegExp[] {
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
parseRx(value : any) {
|
||||||
if (value === null || value === undefined || value === "") {
|
if (value === null || value === undefined || value === "") {
|
||||||
return [];
|
return [];
|
||||||
} else if (!(value instanceof Array)) {
|
} else if (!(value instanceof Array)) {
|
||||||
|
@ -45,7 +73,7 @@ export class ScopedSeed
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
addExclusion(value) {
|
addExclusion(value: string | RegExp) {
|
||||||
if (!value) {
|
if (!value) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -55,7 +83,7 @@ export class ScopedSeed
|
||||||
this.exclude.push(value);
|
this.exclude.push(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
removeExclusion(value) {
|
removeExclusion(value: string) {
|
||||||
for (let i = 0; i < this.exclude.length; i++) {
|
for (let i = 0; i < this.exclude.length; i++) {
|
||||||
if (this.exclude[i].toString() == value.toString()) {
|
if (this.exclude[i].toString() == value.toString()) {
|
||||||
this.exclude.splice(i, 1);
|
this.exclude.splice(i, 1);
|
||||||
|
@ -64,7 +92,7 @@ export class ScopedSeed
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
parseUrl(url, logDetails = {}) {
|
parseUrl(url: string, logDetails = {}) {
|
||||||
let parsedUrl = null;
|
let parsedUrl = null;
|
||||||
try {
|
try {
|
||||||
parsedUrl = new URL(url.trim());
|
parsedUrl = new URL(url.trim());
|
||||||
|
@ -81,18 +109,21 @@ export class ScopedSeed
|
||||||
return parsedUrl;
|
return parsedUrl;
|
||||||
}
|
}
|
||||||
|
|
||||||
resolveSiteMap(sitemap) {
|
resolveSiteMap(sitemap: boolean | string | null) : string | null {
|
||||||
if (sitemap === true) {
|
if (sitemap === true) {
|
||||||
const url = new URL(this.url);
|
const url = new URL(this.url);
|
||||||
url.pathname = "/sitemap.xml";
|
url.pathname = "/sitemap.xml";
|
||||||
return url.href;
|
return url.href;
|
||||||
|
} else if (typeof(sitemap) === "string") {
|
||||||
|
const url = new URL(sitemap, this.url);
|
||||||
|
return url.href;
|
||||||
}
|
}
|
||||||
|
|
||||||
return sitemap;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
scopeFromType(scopeType, parsedUrl) {
|
scopeFromType(scopeType: ScopeType, parsedUrl: URL) : [RegExp[], boolean] {
|
||||||
let include;
|
let include : RegExp[] = [];
|
||||||
let allowHash = false;
|
let allowHash = false;
|
||||||
|
|
||||||
switch (scopeType) {
|
switch (scopeType) {
|
||||||
|
@ -132,26 +163,26 @@ export class ScopedSeed
|
||||||
return [include, allowHash];
|
return [include, allowHash];
|
||||||
}
|
}
|
||||||
|
|
||||||
isAtMaxDepth(depth) {
|
isAtMaxDepth(depth: number) {
|
||||||
return depth >= this.maxDepth;
|
return depth >= this.maxDepth;
|
||||||
}
|
}
|
||||||
|
|
||||||
isIncluded(url, depth, extraHops = 0, logDetails = {}) {
|
isIncluded(url: string, depth: number, extraHops = 0, logDetails = {}) {
|
||||||
if (depth > this.maxDepth) {
|
if (depth > this.maxDepth) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
url = this.parseUrl(url, logDetails);
|
const urlParsed = this.parseUrl(url, logDetails);
|
||||||
if (!url) {
|
if (!urlParsed) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!this.allowHash) {
|
if (!this.allowHash) {
|
||||||
// remove hashtag
|
// remove hashtag
|
||||||
url.hash = "";
|
urlParsed.hash = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
url = url.href;
|
url = urlParsed.href;
|
||||||
|
|
||||||
if (url === this.url) {
|
if (url === this.url) {
|
||||||
return true;
|
return true;
|
||||||
|
@ -194,11 +225,11 @@ export class ScopedSeed
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export function rxEscape(string) {
|
export function rxEscape(string: string) {
|
||||||
return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
|
return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
|
||||||
}
|
}
|
||||||
|
|
||||||
export function urlRxEscape(url, parsedUrl) {
|
export function urlRxEscape(url: string, parsedUrl: URL) {
|
||||||
return rxEscape(url).replace(parsedUrl.protocol, "https?:");
|
return rxEscape(url).replace(parsedUrl.protocol, "https?:");
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,60 +1,144 @@
|
||||||
|
import { Redis, Result, Callback } from "ioredis";
|
||||||
|
|
||||||
import { logger } from "./logger.js";
|
import { logger } from "./logger.js";
|
||||||
|
|
||||||
import { MAX_DEPTH } from "./constants.js";
|
import { MAX_DEPTH } from "./constants.js";
|
||||||
|
import { ScopedSeed } from "./seeds.js";
|
||||||
|
import { Frame } from "puppeteer-core";
|
||||||
|
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
export const LoadState = {
|
export enum LoadState {
|
||||||
FAILED: 0,
|
FAILED = 0,
|
||||||
CONTENT_LOADED: 1,
|
CONTENT_LOADED = 1,
|
||||||
FULL_PAGE_LOADED: 2,
|
FULL_PAGE_LOADED = 2,
|
||||||
EXTRACTION_DONE: 3,
|
EXTRACTION_DONE = 3,
|
||||||
BEHAVIORS_DONE: 4,
|
BEHAVIORS_DONE = 4,
|
||||||
};
|
}
|
||||||
|
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
export const QueueState = {
|
export enum QueueState {
|
||||||
ADDED: 0,
|
ADDED = 0,
|
||||||
LIMIT_HIT: 1,
|
LIMIT_HIT = 1,
|
||||||
DUPE_URL: 2,
|
DUPE_URL = 2,
|
||||||
};
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
export type WorkerId = number;
|
||||||
|
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
export class PageState
|
export class PageState
|
||||||
{
|
{
|
||||||
constructor(redisData) {
|
url: string;
|
||||||
|
seedId: number;
|
||||||
|
depth: number;
|
||||||
|
extraHops: number;
|
||||||
|
|
||||||
|
workerid!: WorkerId;
|
||||||
|
|
||||||
|
pageid?: string;
|
||||||
|
title?: string;
|
||||||
|
mime?: string;
|
||||||
|
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
callbacks: any;
|
||||||
|
|
||||||
|
isHTMLPage?: boolean;
|
||||||
|
text?: string;
|
||||||
|
favicon?: string;
|
||||||
|
|
||||||
|
skipBehaviors = false;
|
||||||
|
filteredFrames: Frame[] = [];
|
||||||
|
loadState : LoadState = LoadState.FAILED;
|
||||||
|
|
||||||
|
logDetails = {};
|
||||||
|
|
||||||
|
constructor(redisData: {url: string, seedId: number, depth: number, extraHops: number}) {
|
||||||
this.url = redisData.url;
|
this.url = redisData.url;
|
||||||
this.seedId = redisData.seedId;
|
this.seedId = redisData.seedId;
|
||||||
this.depth = redisData.depth;
|
this.depth = redisData.depth;
|
||||||
this.extraHops = redisData.extraHops;
|
this.extraHops = redisData.extraHops;
|
||||||
|
|
||||||
this.workerid = null;
|
|
||||||
this.pageid = null;
|
|
||||||
this.title = null;
|
|
||||||
|
|
||||||
this.isHTMLPage = null;
|
|
||||||
this.text = null;
|
|
||||||
|
|
||||||
this.skipBehaviors = false;
|
|
||||||
this.filteredFrames = [];
|
|
||||||
|
|
||||||
this.loadState = LoadState.FAILED;
|
|
||||||
this.logDetails = {};
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
declare module "ioredis" {
|
||||||
|
interface RedisCommander<Context> {
|
||||||
|
addqueue(
|
||||||
|
pkey: string,
|
||||||
|
qkey: string,
|
||||||
|
skey: string,
|
||||||
|
url: string,
|
||||||
|
score: number,
|
||||||
|
data: string,
|
||||||
|
limit: number,
|
||||||
|
): Result<number, Context>;
|
||||||
|
|
||||||
|
getnext(
|
||||||
|
qkey: string,
|
||||||
|
pkey: string,
|
||||||
|
): Result<string, Context>;
|
||||||
|
|
||||||
|
markstarted(
|
||||||
|
pkey: string,
|
||||||
|
pkeyUrl: string,
|
||||||
|
url: string,
|
||||||
|
started: string,
|
||||||
|
maxPageTime: number,
|
||||||
|
uid: string,
|
||||||
|
): Result<void, Context>;
|
||||||
|
|
||||||
|
movefailed(
|
||||||
|
pkey: string,
|
||||||
|
fkey: string,
|
||||||
|
url: string,
|
||||||
|
value: string,
|
||||||
|
state: string,
|
||||||
|
): Result<void, Context>;
|
||||||
|
|
||||||
|
unlockpending(
|
||||||
|
pkeyUrl: string,
|
||||||
|
uid: string,
|
||||||
|
callback?: Callback<string>
|
||||||
|
): Result<void, Context>;
|
||||||
|
|
||||||
|
requeue(
|
||||||
|
pkey: string,
|
||||||
|
qkey: string,
|
||||||
|
pkeyUrl: string,
|
||||||
|
url: string,
|
||||||
|
maxRetryPending: number,
|
||||||
|
): Result<number, Context>;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
export class RedisCrawlState
|
export class RedisCrawlState
|
||||||
{
|
{
|
||||||
constructor(redis, key, maxPageTime, uid) {
|
redis: Redis;
|
||||||
|
maxRetryPending = 1;
|
||||||
|
_lastSize = 0;
|
||||||
|
|
||||||
|
uid: string;
|
||||||
|
key: string;
|
||||||
|
maxPageTime: number;
|
||||||
|
|
||||||
|
qkey: string;
|
||||||
|
pkey: string;
|
||||||
|
skey: string;
|
||||||
|
dkey: string;
|
||||||
|
fkey: string;
|
||||||
|
ekey: string;
|
||||||
|
|
||||||
|
constructor(redis: Redis, key: string, maxPageTime: number, uid: string) {
|
||||||
this.redis = redis;
|
this.redis = redis;
|
||||||
|
|
||||||
this.maxRetryPending = 1;
|
|
||||||
|
|
||||||
this._lastSize = 0;
|
|
||||||
|
|
||||||
this.uid = uid;
|
this.uid = uid;
|
||||||
this.key = key;
|
this.key = key;
|
||||||
|
@ -73,7 +157,7 @@ export class RedisCrawlState
|
||||||
this._initLuaCommands(this.redis);
|
this._initLuaCommands(this.redis);
|
||||||
}
|
}
|
||||||
|
|
||||||
_initLuaCommands(redis) {
|
_initLuaCommands(redis: Redis) {
|
||||||
redis.defineCommand("addqueue", {
|
redis.defineCommand("addqueue", {
|
||||||
numberOfKeys: 3,
|
numberOfKeys: 3,
|
||||||
lua: `
|
lua: `
|
||||||
|
@ -184,58 +268,58 @@ return 0;
|
||||||
return new Date().toISOString();
|
return new Date().toISOString();
|
||||||
}
|
}
|
||||||
|
|
||||||
async markStarted(url) {
|
async markStarted(url: string) {
|
||||||
const started = this._timestamp();
|
const started = this._timestamp();
|
||||||
|
|
||||||
return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.maxPageTime, this.uid);
|
return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.maxPageTime, this.uid);
|
||||||
}
|
}
|
||||||
|
|
||||||
async markFinished(url) {
|
async markFinished(url: string) {
|
||||||
await this.redis.hdel(this.pkey, url);
|
await this.redis.hdel(this.pkey, url);
|
||||||
|
|
||||||
return await this.redis.incr(this.dkey);
|
return await this.redis.incr(this.dkey);
|
||||||
}
|
}
|
||||||
|
|
||||||
async markFailed(url) {
|
async markFailed(url: string) {
|
||||||
await this.redis.movefailed(this.pkey, this.fkey, url, "1", "failed");
|
await this.redis.movefailed(this.pkey, this.fkey, url, "1", "failed");
|
||||||
|
|
||||||
return await this.redis.incr(this.dkey);
|
return await this.redis.incr(this.dkey);
|
||||||
}
|
}
|
||||||
|
|
||||||
async markExcluded(url) {
|
async markExcluded(url: string) {
|
||||||
await this.redis.hdel(this.pkey, url);
|
await this.redis.hdel(this.pkey, url);
|
||||||
|
|
||||||
await this.redis.srem(this.skey, url);
|
await this.redis.srem(this.skey, url);
|
||||||
}
|
}
|
||||||
|
|
||||||
recheckScope(data, seeds) {
|
recheckScope(data: {url: string, depth: number, extraHops: number, seedId: number}, seeds: ScopedSeed[]) {
|
||||||
const seed = seeds[data.seedId];
|
const seed = seeds[data.seedId];
|
||||||
|
|
||||||
return seed.isIncluded(data.url, data.depth, data.extraHops);
|
return seed.isIncluded(data.url, data.depth, data.extraHops);
|
||||||
}
|
}
|
||||||
|
|
||||||
async isFinished() {
|
async isFinished() {
|
||||||
return (await this.queueSize() == 0) && (await this.numDone() > 0);
|
return ((await this.queueSize()) == 0) && ((await this.numDone()) > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
async setStatus(status_) {
|
async setStatus(status_: string) {
|
||||||
await this.redis.hset(`${this.key}:status`, this.uid, status_);
|
await this.redis.hset(`${this.key}:status`, this.uid, status_);
|
||||||
}
|
}
|
||||||
|
|
||||||
async getStatus() {
|
async getStatus() : Promise<string> {
|
||||||
return await this.redis.hget(`${this.key}:status`, this.uid);
|
return (await this.redis.hget(`${this.key}:status`, this.uid)) || "";
|
||||||
}
|
}
|
||||||
|
|
||||||
async setArchiveSize(size) {
|
async setArchiveSize(size: number) {
|
||||||
return await this.redis.hset(`${this.key}:size`, this.uid, size);
|
return await this.redis.hset(`${this.key}:size`, this.uid, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
async isCrawlStopped() {
|
async isCrawlStopped() {
|
||||||
if (await this.redis.get(`${this.key}:stopping`) === "1") {
|
if ((await this.redis.get(`${this.key}:stopping`)) === "1") {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (await this.redis.hget(`${this.key}:stopone`, this.uid) === "1") {
|
if ((await this.redis.hget(`${this.key}:stopone`, this.uid)) === "1") {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -243,7 +327,7 @@ return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
async isCrawlCanceled() {
|
async isCrawlCanceled() {
|
||||||
return await this.redis.get(`${this.key}:canceled`) === "1";
|
return (await this.redis.get(`${this.key}:canceled`)) === "1";
|
||||||
}
|
}
|
||||||
|
|
||||||
// note: not currently called in crawler, but could be
|
// note: not currently called in crawler, but could be
|
||||||
|
@ -252,7 +336,7 @@ return 0;
|
||||||
await this.redis.set(`${this.key}:stopping`, "1");
|
await this.redis.set(`${this.key}:stopping`, "1");
|
||||||
}
|
}
|
||||||
|
|
||||||
async processMessage(seeds) {
|
async processMessage(seeds: ScopedSeed[]) {
|
||||||
while (true) {
|
while (true) {
|
||||||
const result = await this.redis.lpop(`${this.uid}:msg`);
|
const result = await this.redis.lpop(`${this.uid}:msg`);
|
||||||
if (!result) {
|
if (!result) {
|
||||||
|
@ -285,18 +369,20 @@ return 0;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} // TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
catch (e: any) {
|
||||||
logger.warn("Error processing message", e, "redisMessage");
|
logger.warn("Error processing message", e, "redisMessage");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
isStrMatch(s) {
|
isStrMatch(s: string) {
|
||||||
// if matches original string, then consider not a regex
|
// if matches original string, then consider not a regex
|
||||||
return s.replace(/\\/g, "").replace(/[\\^$*+?.()|[\]{}]/g, "\\$&") === s;
|
return s.replace(/\\/g, "").replace(/[\\^$*+?.()|[\]{}]/g, "\\$&") === s;
|
||||||
}
|
}
|
||||||
|
|
||||||
filterQueue(regexStr) {
|
filterQueue(regexStr: string) {
|
||||||
const regex = new RegExp(regexStr);
|
const regex = new RegExp(regexStr);
|
||||||
|
|
||||||
let matcher = undefined;
|
let matcher = undefined;
|
||||||
|
@ -325,7 +411,7 @@ return 0;
|
||||||
stream.resume();
|
stream.resume();
|
||||||
});
|
});
|
||||||
|
|
||||||
return new Promise(resolve => {
|
return new Promise<void>(resolve => {
|
||||||
stream.on("end", () => {
|
stream.on("end", () => {
|
||||||
resolve();
|
resolve();
|
||||||
});
|
});
|
||||||
|
@ -341,9 +427,12 @@ return 0;
|
||||||
return (res >= 3);
|
return (res >= 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
async addToQueue({url, seedId, depth = 0, extraHops = 0} = {}, limit = 0) {
|
//async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) {
|
||||||
|
async addToQueue({url, seedId, depth = 0, extraHops = 0} : {url: string, seedId: number, depth?: number, extraHops?: number}, limit = 0) {
|
||||||
const added = this._timestamp();
|
const added = this._timestamp();
|
||||||
const data = {added, url, seedId, depth};
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
const data : any = {added, url, seedId, depth};
|
||||||
if (extraHops) {
|
if (extraHops) {
|
||||||
data.extraHops = extraHops;
|
data.extraHops = extraHops;
|
||||||
}
|
}
|
||||||
|
@ -375,8 +464,8 @@ return 0;
|
||||||
return new PageState(data);
|
return new PageState(data);
|
||||||
}
|
}
|
||||||
|
|
||||||
async has(url) {
|
async has(url: string) {
|
||||||
return !!await this.redis.sismember(this.skey, url);
|
return !!(await this.redis.sismember(this.skey, url));
|
||||||
}
|
}
|
||||||
|
|
||||||
async serialize() {
|
async serialize() {
|
||||||
|
@ -390,25 +479,25 @@ return 0;
|
||||||
return {done, queued, pending, failed, errors};
|
return {done, queued, pending, failed, errors};
|
||||||
}
|
}
|
||||||
|
|
||||||
_getScore(data) {
|
_getScore(data: {depth: number, extraHops: number}) {
|
||||||
return (data.depth || 0) + (data.extraHops || 0) * MAX_DEPTH;
|
return (data.depth || 0) + (data.extraHops || 0) * MAX_DEPTH;
|
||||||
}
|
}
|
||||||
|
|
||||||
async _iterSortedKey(key, inc = 100) {
|
async _iterSortedKey(key: string, inc = 100) {
|
||||||
const results = [];
|
const results : string[] = [];
|
||||||
|
|
||||||
const len = await this.redis.zcard(key);
|
const len = await this.redis.zcard(key);
|
||||||
|
|
||||||
for (let i = 0; i < len; i += inc) {
|
for (let i = 0; i < len; i += inc) {
|
||||||
const someResults = await this.redis.zrangebyscore(key, 0, "inf", "limit", i, inc);
|
const someResults = await this.redis.zrangebyscore(key, 0, "inf", "LIMIT", i, inc);
|
||||||
results.push(...someResults);
|
results.push(...someResults);
|
||||||
}
|
}
|
||||||
|
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
async _iterListKeys(key, inc = 100) {
|
async _iterListKeys(key: string, inc = 100) {
|
||||||
const results = [];
|
const results : string[] = [];
|
||||||
|
|
||||||
const len = await this.redis.llen(key);
|
const len = await this.redis.llen(key);
|
||||||
|
|
||||||
|
@ -419,8 +508,10 @@ return 0;
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
async load(state, seeds, checkScope) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
const seen = [];
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
async load(state: Record<string, any>, seeds: ScopedSeed[], checkScope: boolean) {
|
||||||
|
const seen : string[] = [];
|
||||||
|
|
||||||
// need to delete existing keys, if exist to fully reset state
|
// need to delete existing keys, if exist to fully reset state
|
||||||
await this.redis.del(this.qkey);
|
await this.redis.del(this.qkey);
|
||||||
|
@ -486,7 +577,7 @@ return 0;
|
||||||
|
|
||||||
async numDone() {
|
async numDone() {
|
||||||
const done = await this.redis.get(this.dkey);
|
const done = await this.redis.get(this.dkey);
|
||||||
return parseInt(done);
|
return parseInt(done || "0");
|
||||||
}
|
}
|
||||||
|
|
||||||
async numSeen() {
|
async numSeen() {
|
||||||
|
@ -524,7 +615,9 @@ return 0;
|
||||||
for (const url of pendingUrls) {
|
for (const url of pendingUrls) {
|
||||||
await this.redis.unlockpending(this.pkey + ":" + url, this.uid);
|
await this.redis.unlockpending(this.pkey + ":" + url, this.uid);
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} // TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
catch (e: any) {
|
||||||
logger.error("Redis Del Pending Failed", e, "state");
|
logger.error("Redis Del Pending Failed", e, "state");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -551,15 +644,15 @@ return 0;
|
||||||
return this._lastSize;
|
return this._lastSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
async addIfNoDupe(key, value) {
|
async addIfNoDupe(key: string, value: string) {
|
||||||
return await this.redis.sadd(key, value) === 1;
|
return (await this.redis.sadd(key, value)) === 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
async removeDupe(key, value) {
|
async removeDupe(key: string, value: string) {
|
||||||
return await this.redis.srem(key, value);
|
return await this.redis.srem(key, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
async logError(error) {
|
async logError(error: string) {
|
||||||
return await this.redis.lpush(this.ekey, error);
|
return await this.redis.lpush(this.ekey, error);
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -5,20 +5,40 @@ import util from "util";
|
||||||
|
|
||||||
import os from "os";
|
import os from "os";
|
||||||
import { createHash } from "crypto";
|
import { createHash } from "crypto";
|
||||||
|
|
||||||
import crc32 from "crc/crc32";
|
import crc32 from "crc/crc32";
|
||||||
|
|
||||||
import Minio from "minio";
|
import * as Minio from "minio";
|
||||||
|
|
||||||
import { initRedis } from "./redis.js";
|
import { initRedis } from "./redis.js";
|
||||||
import { logger } from "./logger.js";
|
import { logger } from "./logger.js";
|
||||||
|
|
||||||
|
// @ts-expect-error TODO fill in why error is expected
|
||||||
import getFolderSize from "get-folder-size";
|
import getFolderSize from "get-folder-size";
|
||||||
|
|
||||||
|
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
export class S3StorageSync
|
export class S3StorageSync
|
||||||
{
|
{
|
||||||
constructor(urlOrData, {webhookUrl, userId, crawlId} = {}) {
|
fullPrefix: string;
|
||||||
|
client: Minio.Client;
|
||||||
|
|
||||||
|
bucketName: string;
|
||||||
|
objectPrefix: string;
|
||||||
|
resources: object[] = [];
|
||||||
|
|
||||||
|
userId: string;
|
||||||
|
crawlId: string;
|
||||||
|
webhookUrl?: string;
|
||||||
|
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
urlOrData: string | any,
|
||||||
|
{webhookUrl, userId, crawlId} :
|
||||||
|
{webhookUrl?: string, userId: string, crawlId: string}
|
||||||
|
) {
|
||||||
let url;
|
let url;
|
||||||
let accessKey;
|
let accessKey;
|
||||||
let secretKey;
|
let secretKey;
|
||||||
|
@ -47,8 +67,6 @@ export class S3StorageSync
|
||||||
partSize: 100*1024*1024
|
partSize: 100*1024*1024
|
||||||
});
|
});
|
||||||
|
|
||||||
this.client.enableSHA256 = true;
|
|
||||||
|
|
||||||
this.bucketName = url.pathname.slice(1).split("/")[0];
|
this.bucketName = url.pathname.slice(1).split("/")[0];
|
||||||
|
|
||||||
this.objectPrefix = url.pathname.slice(this.bucketName.length + 2);
|
this.objectPrefix = url.pathname.slice(this.bucketName.length + 2);
|
||||||
|
@ -60,12 +78,12 @@ export class S3StorageSync
|
||||||
this.webhookUrl = webhookUrl;
|
this.webhookUrl = webhookUrl;
|
||||||
}
|
}
|
||||||
|
|
||||||
async uploadFile(srcFilename, targetFilename) {
|
async uploadFile(srcFilename: string, targetFilename: string) {
|
||||||
const fileUploadInfo = {
|
const fileUploadInfo = {
|
||||||
"bucket": this.bucketName,
|
"bucket": this.bucketName,
|
||||||
"crawlId": this.crawlId,
|
"crawlId": this.crawlId,
|
||||||
"prefix": this.objectPrefix,
|
"prefix": this.objectPrefix,
|
||||||
"targetFilename": this.targetFilename
|
targetFilename
|
||||||
};
|
};
|
||||||
logger.info("S3 file upload information", fileUploadInfo, "s3Upload");
|
logger.info("S3 file upload information", fileUploadInfo, "s3Upload");
|
||||||
|
|
||||||
|
@ -80,13 +98,13 @@ export class S3StorageSync
|
||||||
return {path, size, hash, crc32, bytes: size};
|
return {path, size, hash, crc32, bytes: size};
|
||||||
}
|
}
|
||||||
|
|
||||||
async downloadFile(srcFilename, destFilename) {
|
async downloadFile(srcFilename: string, destFilename: string) {
|
||||||
await this.client.fGetObject(this.bucketName, this.objectPrefix + srcFilename, destFilename);
|
await this.client.fGetObject(this.bucketName, this.objectPrefix + srcFilename, destFilename);
|
||||||
}
|
}
|
||||||
|
|
||||||
async uploadCollWACZ(srcFilename, targetFilename, completed = true) {
|
async uploadCollWACZ(srcFilename: string, targetFilename: string, completed = true) {
|
||||||
const resource = await this.uploadFile(srcFilename, targetFilename);
|
const resource = await this.uploadFile(srcFilename, targetFilename);
|
||||||
logger.info("WACZ S3 file upload resource", {...targetFilename, resource}, "s3Upload");
|
logger.info("WACZ S3 file upload resource", {targetFilename, resource}, "s3Upload");
|
||||||
|
|
||||||
if (this.webhookUrl) {
|
if (this.webhookUrl) {
|
||||||
const body = {
|
const body = {
|
||||||
|
@ -130,8 +148,8 @@ export function initStorage() {
|
||||||
|
|
||||||
const opts = {
|
const opts = {
|
||||||
crawlId: process.env.CRAWL_ID || os.hostname(),
|
crawlId: process.env.CRAWL_ID || os.hostname(),
|
||||||
webhookUrl: process.env.WEBHOOK_URL,
|
webhookUrl: process.env.WEBHOOK_URL || "",
|
||||||
userId: process.env.STORE_USER,
|
userId: process.env.STORE_USER || "",
|
||||||
};
|
};
|
||||||
|
|
||||||
logger.info("Initing Storage...");
|
logger.info("Initing Storage...");
|
||||||
|
@ -139,12 +157,12 @@ export function initStorage() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
export async function getFileSize(filename) {
|
export async function getFileSize(filename: string) {
|
||||||
const stats = await fsp.stat(filename);
|
const stats = await fsp.stat(filename);
|
||||||
return stats.size;
|
return stats.size;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function getDirSize(dir) {
|
export async function getDirSize(dir: string) {
|
||||||
const { size, errors } = await getFolderSize(dir);
|
const { size, errors } = await getFolderSize(dir);
|
||||||
if (errors && errors.length) {
|
if (errors && errors.length) {
|
||||||
logger.warn("Size check errors", {errors}, "sizecheck");
|
logger.warn("Size check errors", {errors}, "sizecheck");
|
||||||
|
@ -152,8 +170,10 @@ export async function getDirSize(dir) {
|
||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function checkDiskUtilization(params, archiveDirSize, dfOutput=null) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
const diskUsage = await getDiskUsage("/crawls", dfOutput);
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
export async function checkDiskUtilization(params: Record<string, any>, archiveDirSize: number, dfOutput=null) {
|
||||||
|
const diskUsage : Record<string, string> = await getDiskUsage("/crawls", dfOutput);
|
||||||
const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1));
|
const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1));
|
||||||
|
|
||||||
// Check that disk usage isn't already above threshold
|
// Check that disk usage isn't already above threshold
|
||||||
|
@ -199,19 +219,21 @@ export async function checkDiskUtilization(params, archiveDirSize, dfOutput=null
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function getDFOutput(path) {
|
export async function getDFOutput(path: string) {
|
||||||
const exec = util.promisify(child_process.exec);
|
const exec = util.promisify(child_process.exec);
|
||||||
const res = await exec(`df ${path}`);
|
const res = await exec(`df ${path}`);
|
||||||
return res.stdout;
|
return res.stdout;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function getDiskUsage(path="/crawls", dfOutput = null) {
|
export async function getDiskUsage(path="/crawls", dfOutput = null) {
|
||||||
const result = dfOutput || await getDFOutput(path);
|
const result = dfOutput || (await getDFOutput(path));
|
||||||
const lines = result.split("\n");
|
const lines = result.split("\n");
|
||||||
const keys = lines[0].split(/\s+/ig);
|
const keys = lines[0].split(/\s+/ig);
|
||||||
const rows = lines.slice(1).map(line => {
|
const rows = lines.slice(1).map(line => {
|
||||||
const values = line.split(/\s+/ig);
|
const values = line.split(/\s+/ig);
|
||||||
return keys.reduce((o, k, index) => {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
return keys.reduce((o: Record<string, any>, k, index) => {
|
||||||
o[k] = values[index];
|
o[k] = values[index];
|
||||||
return o;
|
return o;
|
||||||
}, {});
|
}, {});
|
||||||
|
@ -219,14 +241,14 @@ export async function getDiskUsage(path="/crawls", dfOutput = null) {
|
||||||
return rows[0];
|
return rows[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
export function calculatePercentageUsed(used, total) {
|
export function calculatePercentageUsed(used: number, total: number) {
|
||||||
return Math.round((used/total) * 100);
|
return Math.round((used/total) * 100);
|
||||||
}
|
}
|
||||||
|
|
||||||
function checksumFile(hashName, path) {
|
function checksumFile(hashName: string, path: string) : Promise<{hash: string, crc32: number}>{
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
const hash = createHash(hashName);
|
const hash = createHash(hashName);
|
||||||
let crc = null;
|
let crc : number = 0;
|
||||||
|
|
||||||
const stream = fs.createReadStream(path);
|
const stream = fs.createReadStream(path);
|
||||||
stream.on("error", err => reject(err));
|
stream.on("error", err => reject(err));
|
||||||
|
@ -238,7 +260,7 @@ function checksumFile(hashName, path) {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
export function interpolateFilename(filename, crawlId) {
|
export function interpolateFilename(filename: string, crawlId: string) {
|
||||||
filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.-]/g, ""));
|
filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.-]/g, ""));
|
||||||
filename = filename.replace("@hostname", os.hostname());
|
filename = filename.replace("@hostname", os.hostname());
|
||||||
filename = filename.replace("@hostsuffix", os.hostname().slice(-14));
|
filename = filename.replace("@hostsuffix", os.hostname().slice(-14));
|
|
@ -1,16 +1,21 @@
|
||||||
import { WARCResourceWriter } from "./warcresourcewriter.js";
|
import { WARCResourceWriter } from "./warcresourcewriter.js";
|
||||||
import { logger } from "./logger.js";
|
import { logger } from "./logger.js";
|
||||||
|
import { CDPSession, Protocol } from "puppeteer-core";
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
export class BaseTextExtract extends WARCResourceWriter {
|
export abstract class BaseTextExtract extends WARCResourceWriter {
|
||||||
constructor(cdp, opts) {
|
cdp: CDPSession;
|
||||||
|
lastText: string | null = null;
|
||||||
|
text: string | null = null;
|
||||||
|
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
constructor(cdp: CDPSession, opts: any) {
|
||||||
super({...opts, warcName: "text.warc.gz"});
|
super({...opts, warcName: "text.warc.gz"});
|
||||||
this.cdp = cdp;
|
this.cdp = cdp;
|
||||||
this.lastText = null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async extractAndStoreText(resourceType, ignoreIfMatchesLast = false, saveToWarc = false) {
|
async extractAndStoreText(resourceType: string, ignoreIfMatchesLast = false, saveToWarc = false) {
|
||||||
try {
|
try {
|
||||||
const text = await this.doGetText();
|
const text = await this.doGetText();
|
||||||
|
|
||||||
|
@ -26,26 +31,26 @@ export class BaseTextExtract extends WARCResourceWriter {
|
||||||
|
|
||||||
this.lastText = text;
|
this.lastText = text;
|
||||||
return {changed: true, text};
|
return {changed: true, text};
|
||||||
} catch (e) {
|
} // TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
catch (e: any) {
|
||||||
logger.debug("Error extracting text", e, "text");
|
logger.debug("Error extracting text", e, "text");
|
||||||
return {changed: false, text: null};
|
return {changed: false, text: null};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async doGetText() {
|
abstract doGetText() : Promise<string>;
|
||||||
throw new Error("unimplemented");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
export class TextExtractViaSnapshot extends BaseTextExtract {
|
export class TextExtractViaSnapshot extends BaseTextExtract {
|
||||||
async doGetText() {
|
async doGetText() : Promise<string> {
|
||||||
const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {computedStyles: []});
|
const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {computedStyles: []});
|
||||||
return this.parseTextFromDOMSnapshot(result);
|
return this.parseTextFromDOMSnapshot(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
parseTextFromDOMSnapshot(result) {
|
parseTextFromDOMSnapshot(result: Protocol.DOMSnapshot.CaptureSnapshotResponse) : string {
|
||||||
const TEXT_NODE = 3;
|
const TEXT_NODE = 3;
|
||||||
const ELEMENT_NODE = 1;
|
const ELEMENT_NODE = 1;
|
||||||
|
|
||||||
|
@ -53,13 +58,13 @@ export class TextExtractViaSnapshot extends BaseTextExtract {
|
||||||
|
|
||||||
const {strings, documents} = result;
|
const {strings, documents} = result;
|
||||||
|
|
||||||
const accum = [];
|
const accum : string[] = [];
|
||||||
|
|
||||||
for (const doc of documents) {
|
for (const doc of documents) {
|
||||||
const nodeValues = doc.nodes.nodeValue;
|
const nodeValues = doc.nodes.nodeValue || [];
|
||||||
const nodeNames = doc.nodes.nodeName;
|
const nodeNames = doc.nodes.nodeName || [];
|
||||||
const nodeTypes = doc.nodes.nodeType;
|
const nodeTypes = doc.nodes.nodeType || [];
|
||||||
const parentIndex = doc.nodes.parentIndex;
|
const parentIndex = doc.nodes.parentIndex || [];
|
||||||
|
|
||||||
for (let i = 0; i < nodeValues.length; i++) {
|
for (let i = 0; i < nodeValues.length; i++) {
|
||||||
if (nodeValues[i] === -1) {
|
if (nodeValues[i] === -1) {
|
||||||
|
@ -74,28 +79,28 @@ export class TextExtractViaSnapshot extends BaseTextExtract {
|
||||||
if (!SKIPPED_NODES.includes(name)) {
|
if (!SKIPPED_NODES.includes(name)) {
|
||||||
const value = strings[nodeValues[i]].trim();
|
const value = strings[nodeValues[i]].trim();
|
||||||
if (value) {
|
if (value) {
|
||||||
accum.push(value);
|
accum.push(value as string);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return accum.join("\n");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return accum.join("\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
export class TextExtractViaDocument extends BaseTextExtract {
|
export class TextExtractViaDocument extends BaseTextExtract {
|
||||||
async doGetText() {
|
async doGetText() : Promise<string> {
|
||||||
const result = await this.cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
|
const result = await this.cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
|
||||||
return this.parseTextFromDOM(result);
|
return this.parseTextFromDOM(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
async parseTextFromDom(dom) {
|
parseTextFromDOM(dom: Protocol.DOM.GetDocumentResponse) : string {
|
||||||
const accum = [];
|
const accum : string[] = [];
|
||||||
const metadata = {};
|
const metadata = {};
|
||||||
|
|
||||||
this.parseText(dom.root, metadata, accum);
|
this.parseText(dom.root, metadata, accum);
|
||||||
|
@ -103,9 +108,9 @@ export class TextExtractViaDocument extends BaseTextExtract {
|
||||||
return accum.join("\n");
|
return accum.join("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
async parseText(node, metadata, accum) {
|
parseText(node: Protocol.DOM.Node, metadata: Record<string, string> | null, accum: string[]) {
|
||||||
const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];
|
const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];
|
||||||
const EMPTY_LIST = [];
|
const EMPTY_LIST : Protocol.DOM.Node[] = [];
|
||||||
const TEXT = "#text";
|
const TEXT = "#text";
|
||||||
const TITLE = "title";
|
const TITLE = "title";
|
||||||
|
|
||||||
|
@ -123,9 +128,9 @@ export class TextExtractViaDocument extends BaseTextExtract {
|
||||||
accum.push(value);
|
accum.push(value);
|
||||||
}
|
}
|
||||||
} else if (name === TITLE) {
|
} else if (name === TITLE) {
|
||||||
const title = [];
|
const title : string[] = [];
|
||||||
|
|
||||||
for (let child of children) {
|
for (const child of children) {
|
||||||
this.parseText(child, null, title);
|
this.parseText(child, null, title);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -135,7 +140,7 @@ export class TextExtractViaDocument extends BaseTextExtract {
|
||||||
accum.push(title.join(" "));
|
accum.push(title.join(" "));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (let child of children) {
|
for (const child of children) {
|
||||||
this.parseText(child, metadata, accum);
|
this.parseText(child, metadata, accum);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,14 +1,24 @@
|
||||||
import { logger } from "./logger.js";
|
import { logger } from "./logger.js";
|
||||||
|
|
||||||
export function sleep(seconds) {
|
export function sleep(seconds: number) {
|
||||||
return new Promise(resolve => setTimeout(resolve, seconds * 1000));
|
return new Promise(resolve => setTimeout(resolve, seconds * 1000));
|
||||||
}
|
}
|
||||||
|
|
||||||
export function timedRun(promise, seconds, message="Promise timed out", logDetails={}, context="general", isWarn=false) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
|
||||||
|
export function timedRun(
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
promise: Promise<any>,
|
||||||
|
seconds: number,
|
||||||
|
message="Promise timed out",
|
||||||
|
logDetails={},
|
||||||
|
context="general",
|
||||||
|
isWarn=false
|
||||||
|
) {
|
||||||
// return Promise return value or log error if timeout is reached first
|
// return Promise return value or log error if timeout is reached first
|
||||||
const timeout = seconds * 1000;
|
const timeout = seconds * 1000;
|
||||||
|
|
||||||
const rejectPromiseOnTimeout = (timeout) => {
|
const rejectPromiseOnTimeout = (timeout: number) => {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
setTimeout(() => (reject("timeout reached")), timeout);
|
setTimeout(() => (reject("timeout reached")), timeout);
|
||||||
});
|
});
|
||||||
|
@ -26,7 +36,7 @@ export function timedRun(promise, seconds, message="Promise timed out", logDetai
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
export function secondsElapsed(startTime, nowDate = null) {
|
export function secondsElapsed(startTime: number, nowDate: Date | null = null) {
|
||||||
nowDate = nowDate || new Date();
|
nowDate = nowDate || new Date();
|
||||||
|
|
||||||
return (nowDate.getTime() - startTime) / 1000;
|
return (nowDate.getTime() - startTime) / 1000;
|
|
@ -4,27 +4,35 @@ import * as warcio from "warcio";
|
||||||
|
|
||||||
export class WARCResourceWriter
|
export class WARCResourceWriter
|
||||||
{
|
{
|
||||||
constructor({url, directory, date, warcName}) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
page: any;
|
||||||
|
url: string;
|
||||||
|
directory: string;
|
||||||
|
warcName: string;
|
||||||
|
date: Date;
|
||||||
|
|
||||||
|
constructor({url, directory, date, warcName} : {url: string, directory: string, date: Date, warcName: string}) {
|
||||||
this.url = url;
|
this.url = url;
|
||||||
this.directory = directory;
|
this.directory = directory;
|
||||||
this.warcName = path.join(this.directory, warcName);
|
this.warcName = path.join(this.directory, warcName);
|
||||||
this.date = date ? date : new Date();
|
this.date = date ? date : new Date();
|
||||||
}
|
}
|
||||||
|
|
||||||
async writeBufferToWARC(contents, resourceType, contentType) {
|
async writeBufferToWARC(contents: Uint8Array, resourceType: string, contentType: string) {
|
||||||
const warcRecord = await this.wrap(contents, resourceType, contentType);
|
const warcRecord = await this.wrap(contents, resourceType, contentType);
|
||||||
const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});
|
const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});
|
||||||
fs.appendFileSync(this.warcName, warcRecordBuffer);
|
fs.appendFileSync(this.warcName, warcRecordBuffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
async wrap(buffer, resourceType, contentType) {
|
async wrap(buffer: Uint8Array, resourceType: string, contentType: string) {
|
||||||
const warcVersion = "WARC/1.1";
|
const warcVersion = "WARC/1.1";
|
||||||
const warcRecordType = "resource";
|
const warcRecordType = "resource";
|
||||||
const warcHeaders = {"Content-Type": contentType};
|
const warcHeaders = {"Content-Type": contentType};
|
||||||
async function* content() {
|
async function* content() {
|
||||||
yield buffer;
|
yield buffer;
|
||||||
}
|
}
|
||||||
let resourceUrl = `urn:${resourceType}:${this.url}`;
|
const resourceUrl = `urn:${resourceType}:${this.url}`;
|
||||||
|
|
||||||
return warcio.WARCRecord.create({
|
return warcio.WARCRecord.create({
|
||||||
url: resourceUrl,
|
url: resourceUrl,
|
|
@ -1,15 +1,32 @@
|
||||||
import fs from "fs";
|
import fs from "fs";
|
||||||
|
import { Writable } from "stream";
|
||||||
import path from "path";
|
import path from "path";
|
||||||
|
|
||||||
import { CDXIndexer } from "warcio";
|
import { CDXIndexer } from "warcio";
|
||||||
import { WARCSerializer } from "warcio/node";
|
import { WARCSerializer } from "warcio/node";
|
||||||
import { logger, errJSON } from "./logger.js";
|
import { logger, errJSON } from "./logger.js";
|
||||||
|
import type { IndexerOffsetLength, WARCRecord } from "warcio";
|
||||||
|
|
||||||
|
|
||||||
// =================================================================
|
// =================================================================
|
||||||
export class WARCWriter
|
export class WARCWriter implements IndexerOffsetLength
|
||||||
{
|
{
|
||||||
constructor({archivesDir, tempCdxDir, filename, gzip, logDetails}) {
|
archivesDir: string;
|
||||||
|
tempCdxDir: string;
|
||||||
|
filename: string;
|
||||||
|
gzip: boolean;
|
||||||
|
logDetails: Record<string, string>;
|
||||||
|
|
||||||
|
offset = 0;
|
||||||
|
recordLength = 0;
|
||||||
|
|
||||||
|
indexer?: CDXIndexer;
|
||||||
|
|
||||||
|
fh?: Writable | null;
|
||||||
|
cdxFH?: Writable | null;
|
||||||
|
|
||||||
|
constructor({archivesDir, tempCdxDir, filename, gzip, logDetails} :
|
||||||
|
{archivesDir: string, tempCdxDir: string, filename: string, gzip: boolean, logDetails: Record<string, string>}) {
|
||||||
this.archivesDir = archivesDir;
|
this.archivesDir = archivesDir;
|
||||||
this.tempCdxDir = tempCdxDir;
|
this.tempCdxDir = tempCdxDir;
|
||||||
this.filename = filename;
|
this.filename = filename;
|
||||||
|
@ -21,12 +38,7 @@ export class WARCWriter
|
||||||
|
|
||||||
if (this.tempCdxDir) {
|
if (this.tempCdxDir) {
|
||||||
this.indexer = new CDXIndexer({format: "cdxj"});
|
this.indexer = new CDXIndexer({format: "cdxj"});
|
||||||
} else {
|
|
||||||
this.indexer = null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
this.fh = null;
|
|
||||||
this.cdxFH = null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async initFH() {
|
async initFH() {
|
||||||
|
@ -38,7 +50,7 @@ export class WARCWriter
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async writeRecordPair(responseRecord, requestRecord, responseSerializer = null) {
|
async writeRecordPair(responseRecord: WARCRecord, requestRecord: WARCRecord, responseSerializer: WARCSerializer | undefined = undefined) {
|
||||||
const opts = {gzip: this.gzip};
|
const opts = {gzip: this.gzip};
|
||||||
|
|
||||||
if (!responseSerializer) {
|
if (!responseSerializer) {
|
||||||
|
@ -58,10 +70,14 @@ export class WARCWriter
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async _writeRecord(record, serializer) {
|
async _writeRecord(record: WARCRecord, serializer: WARCSerializer) {
|
||||||
let total = 0;
|
let total = 0;
|
||||||
const url = record.warcTargetURI;
|
const url = record.warcTargetURI;
|
||||||
|
|
||||||
|
if (!this.fh) {
|
||||||
|
throw new Error("writer not initialized");
|
||||||
|
}
|
||||||
|
|
||||||
for await (const chunk of serializer) {
|
for await (const chunk of serializer) {
|
||||||
total += chunk.length;
|
total += chunk.length;
|
||||||
try {
|
try {
|
||||||
|
@ -74,12 +90,12 @@ export class WARCWriter
|
||||||
return total;
|
return total;
|
||||||
}
|
}
|
||||||
|
|
||||||
_writeCDX(record) {
|
_writeCDX(record: WARCRecord | null) {
|
||||||
if (this.indexer) {
|
if (this.indexer) {
|
||||||
const cdx = this.indexer.indexRecord(record, this, this.filename);
|
const cdx = this.indexer.indexRecord(record, this, this.filename);
|
||||||
|
|
||||||
if (this.indexer && this.cdxFH && cdx) {
|
if (this.indexer && this.cdxFH && cdx) {
|
||||||
this.indexer.write(cdx, this.cdxFH);
|
this.indexer.write(cdx, this.cdxFH as NodeJS.WriteStream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -102,8 +118,8 @@ export class WARCWriter
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================
|
// =================================================================
|
||||||
export function streamFinish(fh) {
|
export function streamFinish(fh: Writable) {
|
||||||
const p = new Promise(resolve => {
|
const p = new Promise<void>(resolve => {
|
||||||
fh.once("finish", () => resolve());
|
fh.once("finish", () => resolve());
|
||||||
});
|
});
|
||||||
fh.end();
|
fh.end();
|
|
@ -6,6 +6,8 @@ import { logger, errJSON } from "./logger.js";
|
||||||
import { sleep, timedRun } from "./timing.js";
|
import { sleep, timedRun } from "./timing.js";
|
||||||
import { Recorder } from "./recorder.js";
|
import { Recorder } from "./recorder.js";
|
||||||
import { rxEscape } from "./seeds.js";
|
import { rxEscape } from "./seeds.js";
|
||||||
|
import { CDPSession, Page } from "puppeteer-core";
|
||||||
|
import { PageState, WorkerId } from "./state.js";
|
||||||
|
|
||||||
const MAX_REUSE = 5;
|
const MAX_REUSE = 5;
|
||||||
|
|
||||||
|
@ -14,7 +16,9 @@ const TEARDOWN_TIMEOUT = 10;
|
||||||
const FINISHED_TIMEOUT = 60;
|
const FINISHED_TIMEOUT = 60;
|
||||||
|
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
export function runWorkers(crawler, numWorkers, maxPageTime, collDir) {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
export function runWorkers(crawler: any, numWorkers: number, maxPageTime: number, collDir: string) {
|
||||||
logger.info(`Creating ${numWorkers} workers`, {}, "worker");
|
logger.info(`Creating ${numWorkers} workers`, {}, "worker");
|
||||||
|
|
||||||
const workers = [];
|
const workers = [];
|
||||||
|
@ -29,40 +33,73 @@ export function runWorkers(crawler, numWorkers, maxPageTime, collDir) {
|
||||||
const rx = new RegExp(rxEscape(process.env.CRAWL_ID) + "\\-([\\d]+)$");
|
const rx = new RegExp(rxEscape(process.env.CRAWL_ID) + "\\-([\\d]+)$");
|
||||||
const m = os.hostname().match(rx);
|
const m = os.hostname().match(rx);
|
||||||
if (m) {
|
if (m) {
|
||||||
offset = m[1] * numWorkers;
|
offset = Number(m[1]) * numWorkers;
|
||||||
logger.info("Starting workerid index at " + offset, "worker");
|
logger.info("Starting workerid index at " + offset, "worker");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (let i = 0; i < numWorkers; i++) {
|
for (let i = 0; i < numWorkers; i++) {
|
||||||
workers.push(new PageWorker(i + offset, crawler, maxPageTime, collDir));
|
workers.push(new PageWorker((i + offset), crawler, maxPageTime, collDir));
|
||||||
}
|
}
|
||||||
|
|
||||||
return Promise.allSettled(workers.map((worker) => worker.run()));
|
return Promise.allSettled(workers.map((worker) => worker.run()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// ===========================================================================
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
export type WorkerOpts = Record<string, any> & {
|
||||||
|
page: Page;
|
||||||
|
cdp: CDPSession;
|
||||||
|
workerid: WorkerId;
|
||||||
|
// eslint-disable-next-line @typescript-eslint/ban-types
|
||||||
|
callbacks: Record<string, Function>;
|
||||||
|
directFetchCapture?: ((url: string) => Promise<{fetched: boolean, mime: string}>) | null;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ===========================================================================
|
||||||
|
export type WorkerState = WorkerOpts & {
|
||||||
|
data: PageState
|
||||||
|
};
|
||||||
|
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
export class PageWorker
|
export class PageWorker
|
||||||
{
|
{
|
||||||
constructor(id, crawler, maxPageTime, collDir) {
|
id: WorkerId;
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
crawler: any;
|
||||||
|
maxPageTime: number;
|
||||||
|
|
||||||
|
reuseCount = 0;
|
||||||
|
page?: Page | null;
|
||||||
|
cdp?: CDPSession | null;
|
||||||
|
|
||||||
|
// eslint-disable-next-line @typescript-eslint/ban-types
|
||||||
|
callbacks?: Record<string, Function>;
|
||||||
|
|
||||||
|
opts?: WorkerOpts;
|
||||||
|
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
logDetails: Record<string, any> = {};
|
||||||
|
|
||||||
|
crashed = false;
|
||||||
|
markCrashed?: (reason: string) => void;
|
||||||
|
crashBreak?: Promise<void>;
|
||||||
|
|
||||||
|
recorder: Recorder;
|
||||||
|
|
||||||
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
constructor(id: WorkerId, crawler: any, maxPageTime: number, collDir: string) {
|
||||||
this.id = id;
|
this.id = id;
|
||||||
this.crawler = crawler;
|
this.crawler = crawler;
|
||||||
this.maxPageTime = maxPageTime;
|
this.maxPageTime = maxPageTime;
|
||||||
|
|
||||||
this.reuseCount = 0;
|
|
||||||
this.page = null;
|
|
||||||
this.cdp = null;
|
|
||||||
this.callbacks = null;
|
|
||||||
|
|
||||||
this.opts = null;
|
|
||||||
|
|
||||||
this.logDetails = {workerid: this.id};
|
this.logDetails = {workerid: this.id};
|
||||||
|
|
||||||
this.crashed = false;
|
|
||||||
this.markCrashed = null;
|
|
||||||
this.crashBreak = null;
|
|
||||||
|
|
||||||
this.recorder = new Recorder({workerid: id, collDir, crawler: this.crawler});
|
this.recorder = new Recorder({workerid: id, collDir, crawler: this.crawler});
|
||||||
|
|
||||||
this.crawler.browser.recorders.push(this.recorder);
|
this.crawler.browser.recorders.push(this.recorder);
|
||||||
|
@ -108,9 +145,9 @@ export class PageWorker
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
isSameOrigin(url) {
|
isSameOrigin(url: string) {
|
||||||
try {
|
try {
|
||||||
const currURL = new URL(this.page.url());
|
const currURL = new URL(this.page ? this.page.url() : "");
|
||||||
const newURL = new URL(url);
|
const newURL = new URL(url);
|
||||||
return currURL.origin === newURL.origin;
|
return currURL.origin === newURL.origin;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
|
@ -118,8 +155,8 @@ export class PageWorker
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async initPage(url) {
|
async initPage(url: string) : Promise<WorkerOpts> {
|
||||||
if (!this.crashed && this.page && ++this.reuseCount <= MAX_REUSE && this.isSameOrigin(url)) {
|
if (!this.crashed && this.page && this.opts && ++this.reuseCount <= MAX_REUSE && this.isSameOrigin(url)) {
|
||||||
logger.debug("Reusing page", {reuseCount: this.reuseCount, ...this.logDetails}, "worker");
|
logger.debug("Reusing page", {reuseCount: this.reuseCount, ...this.logDetails}, "worker");
|
||||||
return this.opts;
|
return this.opts;
|
||||||
} else if (this.page) {
|
} else if (this.page) {
|
||||||
|
@ -151,10 +188,10 @@ export class PageWorker
|
||||||
this.page = page;
|
this.page = page;
|
||||||
this.cdp = cdp;
|
this.cdp = cdp;
|
||||||
this.callbacks = {};
|
this.callbacks = {};
|
||||||
const directFetchCapture = this.recorder ? (x) => this.recorder.directFetchCapture(x) : null;
|
const directFetchCapture = this.recorder ? (x: string) => this.recorder.directFetchCapture(x) : null;
|
||||||
this.opts = {
|
this.opts = {
|
||||||
page: this.page,
|
page,
|
||||||
cdp: this.cdp,
|
cdp,
|
||||||
workerid,
|
workerid,
|
||||||
callbacks: this.callbacks,
|
callbacks: this.callbacks,
|
||||||
directFetchCapture,
|
directFetchCapture,
|
||||||
|
@ -168,15 +205,19 @@ export class PageWorker
|
||||||
this.crashed = false;
|
this.crashed = false;
|
||||||
this.crashBreak = new Promise((resolve, reject) => this.markCrashed = reject);
|
this.crashBreak = new Promise((resolve, reject) => this.markCrashed = reject);
|
||||||
|
|
||||||
this.logDetails = {page: this.page.url(), workerid};
|
this.logDetails = {page: page.url(), workerid};
|
||||||
|
|
||||||
// more serious page crash, mark as failed
|
// more serious page crash, mark as failed
|
||||||
this.page.on("error", (err) => {
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
page.on("error", (err: any) => {
|
||||||
// ensure we're still on this page, otherwise ignore!
|
// ensure we're still on this page, otherwise ignore!
|
||||||
if (this.page === page) {
|
if (this.page === page) {
|
||||||
logger.error("Page Crashed", {...errJSON(err), ...this.logDetails}, "worker");
|
logger.error("Page Crashed", {...errJSON(err), ...this.logDetails}, "worker");
|
||||||
this.crashed = true;
|
this.crashed = true;
|
||||||
this.markCrashed("crashed");
|
if (this.markCrashed) {
|
||||||
|
this.markCrashed("crashed");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -204,9 +245,11 @@ export class PageWorker
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
throw new Error("no page available, shouldn't get here");
|
||||||
}
|
}
|
||||||
|
|
||||||
async crawlPage(opts) {
|
async crawlPage(opts: WorkerState) {
|
||||||
const res = await this.crawler.crawlPage(opts);
|
const res = await this.crawler.crawlPage(opts);
|
||||||
if (this.recorder) {
|
if (this.recorder) {
|
||||||
await this.recorder.finishPage();
|
await this.recorder.finishPage();
|
||||||
|
@ -214,7 +257,7 @@ export class PageWorker
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
async timedCrawlPage(opts) {
|
async timedCrawlPage(opts: WorkerState) {
|
||||||
const workerid = this.id;
|
const workerid = this.id;
|
||||||
const { data } = opts;
|
const { data } = opts;
|
||||||
const { url } = data;
|
const { url } = data;
|
||||||
|
@ -244,7 +287,7 @@ export class PageWorker
|
||||||
]);
|
]);
|
||||||
|
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
if (e.message !== "logged" && !this.crashed) {
|
if (e instanceof Error && e.message !== "logged" && !this.crashed) {
|
||||||
logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
|
logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -317,7 +360,7 @@ export class PageWorker
|
||||||
await sleep(0.5);
|
await sleep(0.5);
|
||||||
} else {
|
} else {
|
||||||
// if no pending and queue size is still empty, we're done!
|
// if no pending and queue size is still empty, we're done!
|
||||||
if (!await crawlState.queueSize()) {
|
if (!(await crawlState.queueSize())) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -1,3 +1,4 @@
|
||||||
|
/* eslint-disable @typescript-eslint/no-unused-vars */
|
||||||
class TestBehavior2
|
class TestBehavior2
|
||||||
{
|
{
|
||||||
static init() {
|
static init() {
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
/* eslint-disable @typescript-eslint/no-unused-vars */
|
||||||
class TestBehavior
|
class TestBehavior
|
||||||
{
|
{
|
||||||
static init() {
|
static init() {
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
import { parseArgs } from "../util/argParser.js";
|
import { parseArgs } from "../dist/util/argParser.js";
|
||||||
|
|
||||||
import fs from "fs";
|
import fs from "fs";
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
import { calculatePercentageUsed, checkDiskUtilization } from "../util/storage.js";
|
import { calculatePercentageUsed, checkDiskUtilization } from "../dist/util/storage.js";
|
||||||
|
|
||||||
|
|
||||||
test("ensure calculatePercentageUsed returns expected values", () => {
|
test("ensure calculatePercentageUsed returns expected values", () => {
|
||||||
|
|
107
tsconfig.json
Normal file
107
tsconfig.json
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
{
|
||||||
|
"compilerOptions": {
|
||||||
|
/* Visit https://aka.ms/tsconfig to read more about this file */
|
||||||
|
|
||||||
|
/* Projects */
|
||||||
|
// "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
|
||||||
|
// "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
|
||||||
|
// "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
|
||||||
|
// "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
|
||||||
|
// "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
|
||||||
|
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
|
||||||
|
|
||||||
|
/* Language and Environment */
|
||||||
|
"target": "es2022", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
|
||||||
|
"lib": ["es2022", "dom", "dom.iterable"], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
|
||||||
|
// "jsx": "preserve", /* Specify what JSX code is generated. */
|
||||||
|
// "experimentalDecorators": true, /* Enable experimental support for TC39 stage 2 draft decorators. */
|
||||||
|
// "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
|
||||||
|
// "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
|
||||||
|
// "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
|
||||||
|
// "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
|
||||||
|
// "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
|
||||||
|
// "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
|
||||||
|
// "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
|
||||||
|
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
|
||||||
|
|
||||||
|
/* Modules */
|
||||||
|
"module": "NodeNext", /* Specify what module code is generated. */
|
||||||
|
"rootDir": "./src", /* Specify the root folder within your source files. */
|
||||||
|
"moduleResolution": "NodeNext", /* Specify how TypeScript looks up a file from a given module specifier. */
|
||||||
|
//"baseUrl": "./src", /* Specify the base directory to resolve non-relative module names. */
|
||||||
|
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
|
||||||
|
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
|
||||||
|
// "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */
|
||||||
|
// "types": [], /* Specify type package names to be included without being referenced in a source file. */
|
||||||
|
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
|
||||||
|
// "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
|
||||||
|
// "resolveJsonModule": true, /* Enable importing .json files. */
|
||||||
|
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
|
||||||
|
|
||||||
|
/* JavaScript Support */
|
||||||
|
"allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
|
||||||
|
"checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
|
||||||
|
// "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
|
||||||
|
|
||||||
|
/* Emit */
|
||||||
|
// "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
|
||||||
|
// "declarationMap": true, /* Create sourcemaps for d.ts files. */
|
||||||
|
// "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
|
||||||
|
// "sourceMap": true, /* Create source map files for emitted JavaScript files. */
|
||||||
|
// "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
|
||||||
|
"outDir": "./dist/", /* Specify an output folder for all emitted files. */
|
||||||
|
// "removeComments": true, /* Disable emitting comments. */
|
||||||
|
// "noEmit": true, /* Disable emitting files from a compilation. */
|
||||||
|
// "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
|
||||||
|
// "importsNotUsedAsValues": "remove", /* Specify emit/checking behavior for imports that are only used for types. */
|
||||||
|
// "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
|
||||||
|
// "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
|
||||||
|
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
|
||||||
|
// "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
|
||||||
|
// "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
|
||||||
|
// "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
|
||||||
|
// "newLine": "crlf", /* Set the newline character for emitting files. */
|
||||||
|
// "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
|
||||||
|
// "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
|
||||||
|
// "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
|
||||||
|
// "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
|
||||||
|
// "declarationDir": "./", /* Specify the output directory for generated declaration files. */
|
||||||
|
// "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */
|
||||||
|
|
||||||
|
/* Interop Constraints */
|
||||||
|
// "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
|
||||||
|
// "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
|
||||||
|
//"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
|
||||||
|
// "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
|
||||||
|
"forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
|
||||||
|
|
||||||
|
/* Type Checking */
|
||||||
|
"strict": true, /* Enable all strict type-checking options. */
|
||||||
|
// "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
|
||||||
|
// "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
|
||||||
|
// "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
|
||||||
|
// "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
|
||||||
|
// "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
|
||||||
|
// "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
|
||||||
|
// "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
|
||||||
|
// "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
|
||||||
|
// "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
|
||||||
|
// "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
|
||||||
|
// "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
|
||||||
|
// "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
|
||||||
|
// "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
|
||||||
|
// "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
|
||||||
|
// "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
|
||||||
|
// "noPropertyAccessFromIndexSignature": true, /* Enforce using indexed accessors for keys declared using an index signature. */
|
||||||
|
// "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
|
||||||
|
// "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
|
||||||
|
|
||||||
|
/* Completeness */
|
||||||
|
// "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
|
||||||
|
"skipLibCheck": true /* Skip type checking all .d.ts files. */
|
||||||
|
},
|
||||||
|
|
||||||
|
"include": [
|
||||||
|
"src/**/*",
|
||||||
|
]
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue