Convert to ESM (#179)

* switch base image to chrome/chromium 105 with node 18.x
* convert all source to esm for node 18.x, remove unneeded node-fetch dependency
* ci: use node 18.x, update to latest actions
* tests: convert to esm, run with --experimental-vm-modules
* tests: set higher default timeout (90s) for all tests
* tests: rename driver test fixture to .mjs for loading in jest
* bump to 0.8.0
Ilya Kreymer 2022-10-24 15:30:10 +02:00 committed by Ilya Kreymer
parent 5b738bd24e
commit 277314f2de
31 changed files with 1178 additions and 2246 deletions
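The diffs below apply one mechanical pattern per file: `require()` becomes `import`, `module.exports` becomes `export`, relative imports gain an explicit `.js` extension, and `__dirname` (which does not exist in ESM) is replaced by URLs resolved against `import.meta.url`. A minimal sketch of that last replacement, with paths taken from the diffs below; the `fileURLToPath` variant is an alternative not used in this commit:

```js
import fs from "fs";
import { fileURLToPath } from "url";

// before: fs.readFileSync(path.join(__dirname, "..", "html", "screencast.html"), {encoding: "utf8"})
// after: fs.readFileSync accepts a file: URL built from import.meta.url
const indexHTML = fs.readFileSync(new URL("../html/screencast.html", import.meta.url), {encoding: "utf8"});

// where a plain path string is needed (e.g. spawning uwsgi), the URL's .pathname is used
const uwsgiIni = new URL("uwsgi.ini", import.meta.url).pathname;

// equivalent that also handles Windows drive letters (assumption: not needed in this commit)
const uwsgiIni2 = fileURLToPath(new URL("uwsgi.ini", import.meta.url));
```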

@@ -9,12 +9,12 @@ jobs:
     strategy:
       matrix:
-        node-version: [14.x]
+        node-version: [18.x]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Use Node.js ${{ matrix.node-version }}
-        uses: actions/setup-node@v1
+        uses: actions/setup-node@v3
         with:
           node-version: ${{ matrix.node-version }}
       - name: install requirements
@@ -28,12 +28,12 @@ jobs:
     strategy:
       matrix:
-        node-version: [14.x]
+        node-version: [18.x]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - name: Use Node.js ${{ matrix.node-version }}
-        uses: actions/setup-node@v1
+        uses: actions/setup-node@v3
         with:
           node-version: ${{ matrix.node-version }}
       - name: install requirements
@@ -41,7 +41,7 @@ jobs:
       - name: build docker
         run: docker-compose build
       - name: run jest
-        run: sudo yarn jest
+        run: sudo yarn test

@@ -10,7 +10,7 @@ jobs:
     steps:
       -
         name: Check out the repo
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
       -
         name: Prepare

Dockerfile

@@ -1,5 +1,5 @@
 ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base
-ARG BROWSER_VERSION=101
+ARG BROWSER_VERSION=105

 FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION}

crawler.js

@@ -1,46 +1,50 @@
-const child_process = require("child_process");
-const path = require("path");
-const fs = require("fs");
-const os = require("os");
-const fsp = require("fs/promises");
-const http = require("http");
-const url = require("url");
+import child_process from "child_process";
+import path from "path";
+import fs from "fs";
+import os from "os";
+import fsp from "fs/promises";
+import http from "http";
+import url from "url";
+
+import fetch from "node-fetch";
+import puppeteer from "puppeteer-core";
+import { Cluster } from "puppeteer-cluster";
+import { RedisCrawlState, MemoryCrawlState } from "./util/state.js";
+import AbortController from "abort-controller";
+import Sitemapper from "sitemapper";
+import { v4 as uuidv4 } from "uuid";
+import yaml from "js-yaml";
+import * as warcio from "warcio";
+import { TextExtract } from "./util/textextract.js";
+import { initStorage, getFileSize, getDirSize, interpolateFilename } from "./util/storage.js";
+import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screencaster.js";
+import { parseArgs } from "./util/argParser.js";
+import { initRedis } from "./util/redis.js";
+import { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI } from "./util/browser.js";
+import { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } from "./util/constants.js";
+import { BlockRules } from "./util/blockrules.js";

 // to ignore HTTPS error for HEAD check
-const HTTPS_AGENT = require("https").Agent({
+import { Agent as HTTPAgent } from "http";
+import { Agent as HTTPSAgent } from "https";
+
+const HTTPS_AGENT = HTTPSAgent({
   rejectUnauthorized: false,
 });

-const HTTP_AGENT = require("http").Agent();
-
-const fetch = require("node-fetch");
-const puppeteer = require("puppeteer-core");
-const { Cluster } = require("puppeteer-cluster");
-const { RedisCrawlState, MemoryCrawlState } = require("./util/state");
-const AbortController = require("abort-controller");
-const Sitemapper = require("sitemapper");
-const { v4: uuidv4 } = require("uuid");
-const yaml = require("js-yaml");
-const warcio = require("warcio");
-const behaviors = fs.readFileSync(path.join(__dirname, "node_modules", "browsertrix-behaviors", "dist", "behaviors.js"), {encoding: "utf8"});
-const TextExtract = require("./util/textextract");
-const { initStorage, getFileSize, getDirSize, interpolateFilename } = require("./util/storage");
-const { ScreenCaster, WSTransport, RedisPubSubTransport } = require("./util/screencaster");
-const { parseArgs } = require("./util/argParser");
-const { initRedis } = require("./util/redis");
-const { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI } = require("./util/browser");
-const { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } = require("./util/constants");
-const { BlockRules } = require("./util/blockrules");
+const HTTP_AGENT = HTTPAgent();
+
+const behaviors = fs.readFileSync(new URL("./node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});

 // ============================================================================
-class Crawler {
+export class Crawler {
   constructor() {
     this.headers = {};
     this.crawlState = null;
@@ -254,7 +258,7 @@ class Crawler {
     opts.env = {...process.env, COLL: this.params.collection, ROLLOVER_SIZE: this.params.rolloverSize};

-    subprocesses.push(child_process.spawn("uwsgi", [path.join(__dirname, "uwsgi.ini")], opts));
+    subprocesses.push(child_process.spawn("uwsgi", [new URL("uwsgi.ini", import.meta.url).pathname], opts));

     process.on("exit", () => {
       for (const proc of subprocesses) {
@@ -512,7 +516,8 @@
     }

     try {
-      this.driver = require(this.params.driver);
+      const driverUrl = new URL(this.params.driver, import.meta.url);
+      this.driver = (await import(driverUrl)).default;
     } catch(e) {
       console.warn(e);
       return;
@@ -1257,4 +1262,3 @@ function shouldIgnoreAbort(req) {
   }
 }
-
-module.exports.Crawler = Crawler;
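One behavioral change worth noting in crawler.js: a user-supplied driver can no longer be loaded synchronously with `require()`. A minimal sketch of the new loading path, assuming the driver uses a default export as in the fixtures below; `loadDriver` is an illustrative helper, not a function in the codebase:

```js
// resolve the driver path against this module and import it dynamically;
// ESM import() is async, so driver loading now happens in async code
async function loadDriver(driverPath) {
  const driverUrl = new URL(driverPath, import.meta.url);
  return (await import(driverUrl)).default;
}

// usage: the loaded value is the driver's default export, an async function
// const driver = await loadDriver("/tests/fixtures/driver-1.mjs");
// await driver({data, page, crawler});
```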

defaultDriver.js

@@ -1,4 +1,4 @@
-module.exports = async ({data, page, crawler}) => {
+export default async ({data, page, crawler}) => {
   await crawler.loadPage(page, data);
 };

main.js

@@ -1,5 +1,7 @@
 #!/usr/bin/env node

+import { Crawler } from "./crawler.js";
+
 var crawler = null;

 var lastSigInt = 0;
@@ -54,10 +56,6 @@ process.on("SIGUSR2", () => {
   }
 });

-const { Crawler } = require("./crawler");
-
 crawler = new Crawler();
 crawler.run();
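The `require("./crawler")` sat below the signal handlers; ESM `import` declarations must be top-level and are hoisted, which is why the import moves to the head of the file. A sketch of the resulting entrypoint shape (signal handling elided, filename assumed to be main.js):

```js
#!/usr/bin/env node
// the hoisted import runs first either way, so moving it does not change
// the ordering of handler registration vs. crawler startup
import { Crawler } from "./crawler.js";

var crawler = null;

process.on("SIGINT", () => { /* graceful shutdown elided */ });

crawler = new Crawler();
crawler.run();
```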

package.json

@@ -1,34 +1,39 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.7.1",
+  "version": "0.8.0",
   "main": "browsertrix-crawler",
+  "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
   "author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
   "license": "MIT",
   "scripts": {
-    "lint": "eslint *.js util/*.js tests/*.test.js"
+    "lint": "eslint *.js util/*.js tests/*.test.js",
+    "test": "yarn node --experimental-vm-modules $(yarn bin jest)"
   },
   "dependencies": {
     "abort-controller": "^3.0.0",
     "browsertrix-behaviors": "^0.3.4",
-    "get-folder-size": "2",
+    "get-folder-size": "^4.0.0",
     "ioredis": "^4.27.1",
     "js-yaml": "^4.1.0",
     "minio": "7.0.26",
-    "node-fetch": "^2.6.1",
     "puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
     "puppeteer-core": "^17.1.2",
     "request": "^2.88.2",
     "sitemapper": "^3.1.2",
     "uuid": "8.3.2",
-    "warcio": "1.5.1",
+    "warcio": "^1.6.0",
     "ws": "^7.4.4",
     "yargs": "^16.0.3"
   },
   "devDependencies": {
     "eslint": "^7.20.0",
     "eslint-plugin-react": "^7.22.0",
-    "jest": "^26.6.3",
+    "jest": "^29.2.1",
     "md5": "^2.3.0"
+  },
+  "jest": {
+    "transform": {},
+    "testTimeout": 90000
   }
 }
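Jest's native ESM support still sits behind Node's `--experimental-vm-modules` flag, which is why the new `test` script launches the jest binary through `yarn node` with that flag. The empty `"transform": {}` disables transpilation so test files run as plain ESM, and the global `"testTimeout": 90000` replaces the per-test `jest.setTimeout(...)` calls deleted from the tests below. A sketch of what a test file looks like under this setup (the docker command is illustrative):

```js
import util from "util";
import { exec as execCallback } from "child_process";

const exec = util.promisify(execCallback);

// no jest.setTimeout() needed: the global 90s testTimeout applies
test("crawler image prints usage", async () => {
  const { stdout } = await exec("docker run webrecorder/browsertrix-crawler crawl --help");
  expect(stdout.length).toBeGreaterThan(0);
});
```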

@@ -1,7 +1,9 @@
-const child_process = require("child_process");
-const fs = require("fs");
-const path = require("path");
-const md5 = require("md5");
+import child_process from "child_process";
+import fs from "fs";
+import path from "path";
+import md5 from "md5";

 test("ensure basic crawl run with docker run passes", async () => {

@@ -1,6 +1,6 @@
-const yaml = require("js-yaml");
-const child_process = require("child_process");
-const fs = require("fs");
+import child_process from "child_process";
+import fs from "fs";
+import yaml from "js-yaml";

 function runCrawl(name, config, commandExtra = "") {
   config.generateCDX = true;

@@ -1,8 +1,10 @@
-const util = require("util");
-const exec = util.promisify(require("child_process").exec);
+import util from "util";
+import {exec as execCallback } from "child_process";
+
+const exec = util.promisify(execCallback);

 test("check that the collection name is properly validated", async () => {
-  jest.setTimeout(30000);
   let passed = "";
   try{
@@ -17,7 +19,6 @@ test("check that the collection name is properly validated", async () => {

 test("check that the collection name is not accepted if it doesn't meets our standards", async () => {
-  jest.setTimeout(30000);
   let passed = "";
   try{

@@ -1,11 +1,13 @@
-const yaml = require("js-yaml");
-const util = require("util");
-const exec = util.promisify(require("child_process").exec);
-const fs = require("fs");
+import fs from "fs";
+import yaml from "js-yaml";
+
+import util from "util";
+import {exec as execCallback } from "child_process";
+
+const exec = util.promisify(execCallback);

 test("check yaml config file with seed list is used", async () => {
-  jest.setTimeout(30000);
   try{
     await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --config /tests/fixtures/crawl-1.yaml --depth 0");
@@ -41,8 +43,6 @@ test("check yaml config file with seed list is used", async () => {
 });

 test("check yaml config file will be overwritten by command line", async () => {
-  jest.setTimeout(30000);
   try{
     await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://www.example.com --timeout 20000");

@@ -1,10 +1,8 @@
-const yaml = require("js-yaml");
-const child_process = require("child_process");
-const fs = require("fs");
+import child_process from "child_process";
+import fs from "fs";
+import yaml from "js-yaml";

 test("pass config file via stdin", async () => {
-  jest.setTimeout(30000);
   const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
   const config = yaml.load(configYaml);

@@ -1,11 +1,10 @@
-const child_process = require("child_process");
-const fs = require("fs");
+import child_process from "child_process";
+import fs from "fs";

 test("ensure custom driver with custom selector crawls JS files as pages", async () => {
-  jest.setTimeout(30000);
   try {
-    child_process.execSync("docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.js");
+    child_process.execSync("docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs");
   }
   catch (error) {
     console.log(error);

@@ -1,10 +1,13 @@
-const util = require("util");
-const exec = util.promisify(require("child_process").exec);
-const fs = require("fs");
+import fs from "fs";
+
+import util from "util";
+import {exec as execCallback } from "child_process";
+
+const exec = util.promisify(execCallback);

 test("check that URLs are crawled 2 extra hops beyond depth", async () => {
-  jest.setTimeout(120000);
   try {
     await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7");
   }

tests/fixtures/driver-1.js → tests/fixtures/driver-1.mjs (renamed)

@@ -1,4 +1,4 @@
-module.exports = async ({data, page, crawler}) => {
+export default async ({data, page, crawler}) => {
   await crawler.loadPage(page, data, [{selector: "script[src]", extract: "src", isAttribute: false}]);
 };

@@ -1,6 +1,6 @@
-const { parseArgs } = require("../util/argParser");
-const fs = require("fs");
+import { parseArgs } from "../util/argParser.js";
+import fs from "fs";

 function getSeeds(config) {
   const orig = fs.readFileSync;

@@ -1,10 +1,10 @@
-const util = require("util");
-const exec = util.promisify(require("child_process").exec);
-const fs = require("fs");
+import util from "util";
+import {exec as execCallback } from "child_process";
+import fs from "fs";
+
+const exec = util.promisify(execCallback);

 test("check that URLs one-depth out from the seed-list are crawled", async () => {
-  jest.setTimeout(30000);
   try {
     await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000");

@@ -1,10 +1,8 @@
-const fs = require("fs");
-const zlib = require("zlib");
-const child_process = require("child_process");
+import fs from "fs";
+import zlib from "zlib";
+import child_process from "child_process";

 test("check that the warcinfo file works as expected on the command line", async () => {
-  jest.setTimeout(30000);
   try{
     const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
     const proc = child_process.execSync("docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC", {input: configYaml, stdin: "inherit", encoding: "utf8"});

util/argParser.js

@@ -1,17 +1,17 @@
-const path = require("path");
-const fs = require("fs");
-const os = require("os");
-const yaml = require("js-yaml");
-const puppeteer = require("puppeteer-core");
-const { Cluster } = require("puppeteer-cluster");
-const yargs = require("yargs/yargs");
-const { hideBin } = require("yargs/helpers");
-const { ReuseWindowConcurrency } = require("./windowconcur");
-const { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } = require("./constants");
-const { ScopedSeed } = require("./seeds");
-const { interpolateFilename } = require("./storage");
+import path from "path";
+import fs from "fs";
+import os from "os";
+import yaml from "js-yaml";
+import puppeteer from "puppeteer-core";
+import { Cluster } from "puppeteer-cluster";
+import yargs from "yargs";
+import { hideBin } from "yargs/helpers";
+import { ReuseWindowConcurrency } from "./windowconcur.js";
+import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } from "./constants.js";
+import { ScopedSeed } from "./seeds.js";
+import { interpolateFilename } from "./storage.js";

 // ============================================================================
@@ -127,7 +127,7 @@ class ArgParser {
     "driver": {
       describe: "JS driver for the crawler",
       type: "string",
-      default: path.join(__dirname, "..", "defaultDriver.js"),
+      default: "./defaultDriver.js",
     },

     "generateCDX": {
@@ -459,6 +459,6 @@ class ArgParser {
   }
 }

-module.exports.parseArgs = function(argv) {
+export function parseArgs(argv) {
   return new ArgParser().parseArgs(argv);
-};
+}

util/blockrules.js

@@ -1,5 +1,3 @@
-const fetch = require("node-fetch");
-
 const RULE_TYPES = ["block", "allowOnly"];

 const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];
@@ -44,7 +42,7 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}

 // ===========================================================================
-class BlockRules
+export class BlockRules
 {
   constructor(blockRules, blockPutUrl, blockErrMsg, debugLog) {
     this.rules = [];
@@ -224,6 +222,3 @@ class BlockRules
     await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
   }
 }
-
-module.exports.BlockRules = BlockRules;
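blockrules.js keeps calling `fetch()` after its `node-fetch` import is deleted because Node 18 ships `fetch` as a global, so no import is required at all. A sketch of the PUT it issues for blocked URLs, with an illustrative URL and body:

```js
// global fetch (Node >= 18): no import needed
const putUrl = new URL("https://pywb.proxy/blocked.html"); // illustrative
const body = "<html><body>URL blocked</body></html>";      // illustrative

await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
```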

util/browser.js

@@ -1,13 +1,13 @@
-const child_process = require("child_process");
-const fs = require("fs");
-const path = require("path");
-const os = require("os");
-const request = require("request");
-const { initStorage } = require("./storage");
+import child_process from "child_process";
+import fs from "fs";
+import path from "path";
+import os from "os";
+import request from "request";
+import { initStorage } from "./storage.js";

 const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));

-module.exports.loadProfile = async function(profileFilename) {
+export async function loadProfile(profileFilename) {
   const targetFilename = "/tmp/profile.tar.gz";

   if (profileFilename &&
@@ -46,13 +46,13 @@ module.exports.loadProfile = async function(profileFilename) {
   }
   return profileDir;
-};
+}

-module.exports.saveProfile = function(profileFilename) {
+export function saveProfile(profileFilename) {
   child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: profileDir});
-};
+}

-function getBrowserExe() {
+export function getBrowserExe() {
   const files = [process.env.BROWSER_BIN, "/usr/bin/google-chrome", "/usr/bin/chromium-browser"];
   for (const file of files) {
     if (file && fs.existsSync(file)) {
@@ -64,10 +64,8 @@ function getBrowserExe() {
   }
 }

-module.exports.getBrowserExe = getBrowserExe;
-
-function getDefaultUA() {
+export function getDefaultUA() {
   let version = process.env.BROWSER_VERSION;

   try {
@@ -81,7 +79,6 @@ function getDefaultUA() {
   }
 }

-module.exports.getDefaultUA = getDefaultUA;

 // from https://github.com/microsoft/playwright/blob/main/packages/playwright-core/src/server/chromium/chromium.ts#L327
@@ -120,7 +117,7 @@ const DEFAULT_PLAYWRIGHT_FLAGS = [
 ];

-module.exports.chromeArgs = (proxy, userAgent=null, extraArgs=[]) => {
+export function chromeArgs (proxy, userAgent=null, extraArgs=[]) {
   // Chrome Flags, including proxy server
   const args = [
     ...DEFAULT_PLAYWRIGHT_FLAGS,
@@ -140,10 +137,10 @@ module.exports.chromeArgs = (proxy, userAgent=null, extraArgs=[]) => {
   }

   return args;
-};
+}

-module.exports.evaluateWithCLI = async (frame, funcString) => {
+export async function evaluateWithCLI(frame, funcString) {
   const context = await frame.executionContext();

   // from puppeteer _evaluateInternal() but with includeCommandLineAPI: true
@@ -167,12 +164,12 @@ module.exports.evaluateWithCLI = async (frame, funcString) => {
   }

   return remoteObject.value;
-};
+}

-module.exports.sleep = async (time) => {
+export async function sleep(time) {
   return new Promise(resolve => setTimeout(resolve, time));
-};
+}

util/constants.js

@@ -1,9 +1,9 @@
-module.exports.HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
-module.exports.WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
-module.exports.BEHAVIOR_LOG_FUNC = "__bx_log";
+export const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
+export const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
+export const BEHAVIOR_LOG_FUNC = "__bx_log";

-module.exports.DEFAULT_SELECTORS = [{
+export const DEFAULT_SELECTORS = [{
   selector: "a[href]",
   extract: "href",
   isAttribute: false

util/redis.js

@@ -1,7 +1,7 @@
-const Redis = require("ioredis");
+import Redis from "ioredis";

-module.exports.initRedis = async function(url) {
+export async function initRedis(url) {
   const redis = new Redis(url, {lazyConnect: true});
   await redis.connect();
   return redis;
-};
+}

util/screencaster.js

@@ -1,12 +1,11 @@
-const ws = require("ws");
-const http = require("http");
-const url = require("url");
-const fs = require("fs");
-const path = require("path");
-const { initRedis } = require("./redis");
+import ws from "ws";
+import http from "http";
+import url from "url";
+import fs from "fs";
+import { initRedis } from "./redis.js";

-const indexHTML = fs.readFileSync(path.join(__dirname, "..", "html", "screencast.html"), {encoding: "utf8"});
+const indexHTML = fs.readFileSync(new URL("../html/screencast.html", import.meta.url), {encoding: "utf8"});

 // ===========================================================================
@@ -304,4 +303,4 @@ class ScreenCaster
   }
 }

-module.exports = { ScreenCaster, WSTransport, RedisPubSubTransport };
+export { ScreenCaster, WSTransport, RedisPubSubTransport };

util/seeds.js

@@ -1,4 +1,4 @@
-class ScopedSeed
+export class ScopedSeed
 {
   constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
     const parsedUrl = this.parseUrl(url);
@@ -160,16 +160,14 @@ class ScopedSeed
   }
 }

-function rxEscape(string) {
+export function rxEscape(string) {
   return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
 }

-function urlRxEscape(url, parsedUrl) {
+export function urlRxEscape(url, parsedUrl) {
   return rxEscape(url).replace(parsedUrl.protocol, "https?:");
 }
-
-module.exports.ScopedSeed = ScopedSeed;
-module.exports.rxEscape = rxEscape;

util/state.js

@@ -1,8 +1,9 @@
-const Job = require("puppeteer-cluster/dist/Job").default;
+import mod from "puppeteer-cluster/dist/Job.js";
+const Job = mod.default;

 // ============================================================================
-class BaseState
+export class BaseState
 {
   constructor() {
     this.drainMax = 0;
@@ -43,7 +44,7 @@ class BaseState

 // ============================================================================
-class MemoryCrawlState extends BaseState
+export class MemoryCrawlState extends BaseState
 {
   constructor() {
     super();
@@ -177,7 +178,7 @@ class MemoryCrawlState extends BaseState

 // ============================================================================
-class RedisCrawlState extends BaseState
+export class RedisCrawlState extends BaseState
 {
   constructor(redis, key, pageTimeout, uid) {
     super();
@@ -455,5 +456,3 @@ return 0;
   }
 }
-
-module.exports.RedisCrawlState = RedisCrawlState;
-module.exports.MemoryCrawlState = MemoryCrawlState;
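puppeteer-cluster is published as CommonJS, so deep files like `dist/Job.js` cannot reliably be named-imported from ESM; the whole `module.exports` object arrives as the default import, and a transpiled `export default` class hangs off its `.default` property. A sketch of the two-step unwrap, which also appears in util/windowconcur.js below:

```js
// CJS-from-ESM interop: the default import binds module.exports itself,
// so the transpiled class must be unwrapped from .default
import mod from "puppeteer-cluster/dist/Job.js";
const Job = mod.default;
```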

util/storage.js

@@ -1,20 +1,18 @@
-const fs = require("fs");
-const fsp = require("fs/promises");
-const os = require("os");
-const { createHash } = require("crypto");
-const fetch = require("node-fetch");
-const Minio = require("minio");
-const { initRedis } = require("./redis");
-const util = require("util");
-const getFolderSize = util.promisify(require("get-folder-size"));
+import fs from "fs";
+import fsp from "fs/promises";
+import os from "os";
+import { createHash } from "crypto";
+import Minio from "minio";
+import { initRedis } from "./redis.js";
+import getFolderSize from "get-folder-size";

 // ===========================================================================
-class S3StorageSync
+export class S3StorageSync
 {
   constructor(urlOrData, {webhookUrl, userId, crawlId} = {}) {
     let url;
@@ -110,7 +108,7 @@ class S3StorageSync
   }
 }

-function initStorage() {
+export function initStorage() {
   if (!process.env.STORE_ENDPOINT_URL) {
     return null;
   }
@@ -133,12 +131,12 @@ function initStorage() {
 }

-async function getFileSize(filename) {
+export async function getFileSize(filename) {
   const stats = await fsp.stat(filename);
   return stats.size;
 }

-async function getDirSize(dir) {
+export async function getDirSize(dir) {
   return await getFolderSize(dir);
 }

@@ -152,7 +150,7 @@ function checksumFile(hashName, path) {
   });
 }

-function interpolateFilename(filename, crawlId) {
+export function interpolateFilename(filename, crawlId) {
   filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.-]/g, ""));
   filename = filename.replace("@hostname", os.hostname());
   filename = filename.replace("@hostsuffix", os.hostname().slice(-14));
@@ -160,8 +158,3 @@ function interpolateFilename(filename, crawlId) {
   return filename;
 }
-
-module.exports.S3StorageSync = S3StorageSync;
-module.exports.getFileSize = getFileSize;
-module.exports.getDirSize = getDirSize;
-module.exports.initStorage = initStorage;
-module.exports.interpolateFilename = interpolateFilename;

util/textextract.js

@@ -1,4 +1,4 @@
-class TextExtract {
+export class TextExtract {

   constructor(dom){
     this.dom = dom;
@@ -56,4 +56,3 @@ class TextExtract {
   }
 }
-
-module.exports = TextExtract;

util/windowconcur.js

@@ -1,8 +1,10 @@
-const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;
+import sbi from "puppeteer-cluster/dist/concurrency/SingleBrowserImplementation.js";
+
+const SingleBrowserImplementation = sbi.default;

 // ===========================================================================
-class ReuseWindowConcurrency extends SingleBrowserImplementation {
+export class ReuseWindowConcurrency extends SingleBrowserImplementation {
   async init() {
     await super.init();
@@ -110,6 +112,3 @@ class ReuseWindowConcurrency extends SingleBrowserImplementation {
   }
 }
-
-module.exports = { ReuseWindowConcurrency };

yarn.lock (3043 lines changed)

File diff suppressed because it is too large.