Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00
Convert to ESM (#179)
* switch base image to chrome/chromium 105 with node 18.x
* convert all source to esm for node 18.x, remove unneeded node-fetch dependency
* ci: use node 18.x, update to latest actions
* tests: convert to esm, run with --experimental-vm-modules
* tests: set higher default timeout (90s) for all tests
* tests: rename driver test fixture to .mjs for loading in jest
* bump to 0.8.0
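Most of the diff below is the same mechanical conversion applied file by file: CommonJS require/module.exports becomes ESM import/export, relative imports gain an explicit .js extension, and lookups relative to __dirname become URLs resolved against import.meta.url. A minimal sketch of the pattern (the behaviors.js filename here is only a placeholder for illustration, not a claim about the repo layout):

// CommonJS (before):
//   const fs = require("fs");
//   const path = require("path");
//   const behaviors = fs.readFileSync(path.join(__dirname, "behaviors.js"), {encoding: "utf8"});
//   module.exports.Crawler = Crawler;

// ESM (after): static imports, export statements, and import.meta.url instead of __dirname
import fs from "fs";

// resolve a file relative to this module; replaces path.join(__dirname, ...)
const behaviorsUrl = new URL("./behaviors.js", import.meta.url);
const behaviors = fs.existsSync(behaviorsUrl) ? fs.readFileSync(behaviorsUrl, {encoding: "utf8"}) : "";

export class Crawler {
  constructor() {
    this.behaviors = behaviors;
  }
}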
parent: 5b738bd24e
commit: 277314f2de

31 changed files with 1178 additions and 2246 deletions
.github/workflows/ci.yaml (vendored, 14 changes)

@@ -9,12 +9,12 @@ jobs:
     strategy:
       matrix:
-        node-version: [14.x]
+        node-version: [18.x]

     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Use Node.js ${{ matrix.node-version }}
-        uses: actions/setup-node@v1
+        uses: actions/setup-node@v3
         with:
           node-version: ${{ matrix.node-version }}
       - name: install requirements

@@ -28,12 +28,12 @@ jobs:
     strategy:
       matrix:
-        node-version: [14.x]
+        node-version: [18.x]

     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - name: Use Node.js ${{ matrix.node-version }}
-        uses: actions/setup-node@v1
+        uses: actions/setup-node@v3
         with:
           node-version: ${{ matrix.node-version }}
       - name: install requirements

@@ -41,7 +41,7 @@ jobs:
       - name: build docker
         run: docker-compose build
       - name: run jest
-        run: sudo yarn jest
+        run: sudo yarn test
.github/workflows/release.yaml (vendored, 2 changes)

@@ -10,7 +10,7 @@ jobs:
     steps:
       -
         name: Check out the repo
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3

       -
         name: Prepare
Dockerfile

@@ -1,5 +1,5 @@
 ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base
-ARG BROWSER_VERSION=101
+ARG BROWSER_VERSION=105

 FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION}
crawler.js (76 changes)

@@ -1,46 +1,50 @@
-const child_process = require("child_process");
-const path = require("path");
-const fs = require("fs");
-const os = require("os");
-const fsp = require("fs/promises");
-const http = require("http");
-const url = require("url");
+import child_process from "child_process";
+import path from "path";
+import fs from "fs";
+import os from "os";
+import fsp from "fs/promises";
+import http from "http";
+import url from "url";

+import fetch from "node-fetch";
+import puppeteer from "puppeteer-core";
+import { Cluster } from "puppeteer-cluster";
+import { RedisCrawlState, MemoryCrawlState } from "./util/state.js";
+import AbortController from "abort-controller";
+import Sitemapper from "sitemapper";
+import { v4 as uuidv4 } from "uuid";
+import yaml from "js-yaml";
+
+import * as warcio from "warcio";
+
+import { TextExtract } from "./util/textextract.js";
+import { initStorage, getFileSize, getDirSize, interpolateFilename } from "./util/storage.js";
+import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screencaster.js";
+import { parseArgs } from "./util/argParser.js";
+import { initRedis } from "./util/redis.js";
+
+import { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI } from "./util/browser.js";
+
+import { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } from "./util/constants.js";
+
+import { BlockRules } from "./util/blockrules.js";
+
 // to ignore HTTPS error for HEAD check
-const HTTPS_AGENT = require("https").Agent({
+import { Agent as HTTPAgent } from "http";
+import { Agent as HTTPSAgent } from "https";
+
+const HTTPS_AGENT = HTTPSAgent({
   rejectUnauthorized: false,
 });

-const HTTP_AGENT = require("http").Agent();
+const HTTP_AGENT = HTTPAgent();

-const fetch = require("node-fetch");
-const puppeteer = require("puppeteer-core");
-const { Cluster } = require("puppeteer-cluster");
-const { RedisCrawlState, MemoryCrawlState } = require("./util/state");
-const AbortController = require("abort-controller");
-const Sitemapper = require("sitemapper");
-const { v4: uuidv4 } = require("uuid");
-const yaml = require("js-yaml");
-
-const warcio = require("warcio");
-
-const behaviors = fs.readFileSync(path.join(__dirname, "node_modules", "browsertrix-behaviors", "dist", "behaviors.js"), {encoding: "utf8"});
-
-const TextExtract = require("./util/textextract");
-const { initStorage, getFileSize, getDirSize, interpolateFilename } = require("./util/storage");
-const { ScreenCaster, WSTransport, RedisPubSubTransport } = require("./util/screencaster");
-const { parseArgs } = require("./util/argParser");
-const { initRedis } = require("./util/redis");
-
-const { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI } = require("./util/browser");
-
-const { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } = require("./util/constants");
-
-const { BlockRules } = require("./util/blockrules");
+const behaviors = fs.readFileSync(new URL("./node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});

 // ============================================================================
-class Crawler {
+export class Crawler {
   constructor() {
     this.headers = {};
     this.crawlState = null;

@@ -254,7 +258,7 @@ class Crawler {
     opts.env = {...process.env, COLL: this.params.collection, ROLLOVER_SIZE: this.params.rolloverSize};

-    subprocesses.push(child_process.spawn("uwsgi", [path.join(__dirname, "uwsgi.ini")], opts));
+    subprocesses.push(child_process.spawn("uwsgi", [new URL("uwsgi.ini", import.meta.url).pathname], opts));

     process.on("exit", () => {
       for (const proc of subprocesses) {

@@ -512,7 +516,8 @@ class Crawler {
     }

     try {
-      this.driver = require(this.params.driver);
+      const driverUrl = new URL(this.params.driver, import.meta.url);
+      this.driver = (await import(driverUrl)).default;
     } catch(e) {
       console.warn(e);
       return;

@@ -1257,4 +1262,3 @@ function shouldIgnoreAbort(req) {
   }
 }

-module.exports.Crawler = Crawler;
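ESM has no synchronous require(), so the crawler now loads its driver with a dynamic import() and unwraps the module's default export, as in the hunk above. A small sketch of that pattern, with an illustrative driver path rather than the real CLI option plumbing:

// hypothetical driver module (my-driver.js):
//   export default async ({data, page, crawler}) => {
//     await crawler.loadPage(page, data);
//   };

const driverPath = "./my-driver.js";                      // illustrative; the crawler takes this from --driver
const driverUrl = new URL(driverPath, import.meta.url);   // resolve relative paths against this module

let driver = null;
try {
  driver = (await import(driverUrl)).default;             // dynamic import returns the module namespace
} catch (e) {
  console.warn(e);
}

// later, per page: await driver({data, page, crawler});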
defaultDriver.js

@@ -1,4 +1,4 @@

-module.exports = async ({data, page, crawler}) => {
+export default async ({data, page, crawler}) => {
   await crawler.loadPage(page, data);
 };
main.js (6 changes)

@@ -1,5 +1,7 @@
 #!/usr/bin/env node

+import { Crawler } from "./crawler.js";
+
 var crawler = null;

 var lastSigInt = 0;

@@ -54,10 +56,6 @@ process.on("SIGUSR2", () => {
   }
 });

-
-
-const { Crawler } = require("./crawler");
-
 crawler = new Crawler();
 crawler.run();
package.json (17 changes)

@@ -1,34 +1,39 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.7.1",
+  "version": "0.8.0",
   "main": "browsertrix-crawler",
+  "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
   "author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
   "license": "MIT",
   "scripts": {
-    "lint": "eslint *.js util/*.js tests/*.test.js"
+    "lint": "eslint *.js util/*.js tests/*.test.js",
+    "test": "yarn node --experimental-vm-modules $(yarn bin jest)"
   },
   "dependencies": {
     "abort-controller": "^3.0.0",
     "browsertrix-behaviors": "^0.3.4",
-    "get-folder-size": "2",
+    "get-folder-size": "^4.0.0",
     "ioredis": "^4.27.1",
     "js-yaml": "^4.1.0",
     "minio": "7.0.26",
-    "node-fetch": "^2.6.1",
     "puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
     "puppeteer-core": "^17.1.2",
     "request": "^2.88.2",
     "sitemapper": "^3.1.2",
     "uuid": "8.3.2",
-    "warcio": "1.5.1",
+    "warcio": "^1.6.0",
     "ws": "^7.4.4",
     "yargs": "^16.0.3"
   },
   "devDependencies": {
     "eslint": "^7.20.0",
     "eslint-plugin-react": "^7.22.0",
-    "jest": "^26.6.3",
+    "jest": "^29.2.1",
     "md5": "^2.3.0"
+  },
+  "jest": {
+    "transform": {},
+    "testTimeout": 90000
   }
 }
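Jest's native ESM support still sits behind Node's --experimental-vm-modules flag, which is why the new "test" script launches Jest through yarn node with that flag, and the empty "transform" map keeps Jest from trying to transpile sources that are already ESM. A test written against this setup is plain ESM; the file below is a made-up illustration, not one of the suites changed in this commit:

// example.test.js (hypothetical) — run via `yarn test`
import fs from "fs";
import path from "path";

test("output collection directory can be created", () => {
  const dir = path.join("test-crawls", "example");
  fs.mkdirSync(dir, { recursive: true });   // same output root the real tests mount as /crawls
  expect(fs.existsSync(dir)).toBe(true);
});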
@@ -1,7 +1,9 @@
-const child_process = require("child_process");
-const fs = require("fs");
-const path = require("path");
-const md5 = require("md5");
+import child_process from "child_process";
+import fs from "fs";
+import path from "path";
+import md5 from "md5";


 test("ensure basic crawl run with docker run passes", async () => {
@@ -1,6 +1,6 @@
-const yaml = require("js-yaml");
-const child_process = require("child_process");
-const fs = require("fs");
+import child_process from "child_process";
+import fs from "fs";
+import yaml from "js-yaml";

 function runCrawl(name, config, commandExtra = "") {
   config.generateCDX = true;
@@ -1,8 +1,10 @@
-const util = require("util");
-const exec = util.promisify(require("child_process").exec);
+import util from "util";
+import { exec as execCallback } from "child_process";
+
+const exec = util.promisify(execCallback);


 test("check that the collection name is properly validated", async () => {
-  jest.setTimeout(30000);
   let passed = "";

   try{

@@ -17,7 +19,6 @@ test("check that the collection name is properly validated", async () => {

 test("check that the collection name is not accepted if it doesn't meets our standards", async () => {
-  jest.setTimeout(30000);
   let passed = "";

   try{
@@ -1,11 +1,13 @@
-const yaml = require("js-yaml");
-const util = require("util");
-const exec = util.promisify(require("child_process").exec);
-const fs = require("fs");
+import fs from "fs";
+import yaml from "js-yaml";
+
+import util from "util";
+import { exec as execCallback } from "child_process";
+
+const exec = util.promisify(execCallback);


 test("check yaml config file with seed list is used", async () => {
-  jest.setTimeout(30000);

   try{

     await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --config /tests/fixtures/crawl-1.yaml --depth 0");

@@ -41,8 +43,6 @@ test("check yaml config file with seed list is used", async () => {
 });

 test("check yaml config file will be overwritten by command line", async () => {
-  jest.setTimeout(30000);

   try{

     await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://www.example.com --timeout 20000");
@@ -1,10 +1,8 @@
-const yaml = require("js-yaml");
-const child_process = require("child_process");
-const fs = require("fs");
+import child_process from "child_process";
+import fs from "fs";
+import yaml from "js-yaml";

 test("pass config file via stdin", async () => {
-  jest.setTimeout(30000);

   const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
   const config = yaml.load(configYaml);
@@ -1,11 +1,10 @@
-const child_process = require("child_process");
-const fs = require("fs");
+import child_process from "child_process";
+import fs from "fs";


 test("ensure custom driver with custom selector crawls JS files as pages", async () => {
-  jest.setTimeout(30000);

   try {
-    child_process.execSync("docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.js");
+    child_process.execSync("docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs");
   }
   catch (error) {
     console.log(error);
@@ -1,10 +1,13 @@
-const util = require("util");
-const exec = util.promisify(require("child_process").exec);
-const fs = require("fs");
+import fs from "fs";
+
+import util from "util";
+import { exec as execCallback } from "child_process";
+
+const exec = util.promisify(execCallback);


 test("check that URLs are crawled 2 extra hops beyond depth", async () => {
-  jest.setTimeout(120000);

   try {
     await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7");
   }
tests/fixtures/driver-1.js → tests/fixtures/driver-1.mjs

@@ -1,4 +1,4 @@
-module.exports = async ({data, page, crawler}) => {
+export default async ({data, page, crawler}) => {
   await crawler.loadPage(page, data, [{selector: "script[src]", extract: "src", isAttribute: false}]);
 };
@@ -1,6 +1,6 @@
-const { parseArgs } = require("../util/argParser");
+import { parseArgs } from "../util/argParser.js";

-const fs = require("fs");
+import fs from "fs";

 function getSeeds(config) {
   const orig = fs.readFileSync;
@@ -1,10 +1,10 @@
-const util = require("util");
-const exec = util.promisify(require("child_process").exec);
-const fs = require("fs");
+import util from "util";
+import { exec as execCallback } from "child_process";
+import fs from "fs";
+
+const exec = util.promisify(execCallback);

 test("check that URLs one-depth out from the seed-list are crawled", async () => {
-  jest.setTimeout(30000);

   try {

     await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000");
@@ -1,10 +1,8 @@
-const fs = require("fs");
-const zlib = require("zlib");
-const child_process = require("child_process");
+import fs from "fs";
+import zlib from "zlib";
+import child_process from "child_process";

 test("check that the warcinfo file works as expected on the command line", async () => {
-  jest.setTimeout(30000);

   try{
     const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
     const proc = child_process.execSync("docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC", {input: configYaml, stdin: "inherit", encoding: "utf8"});
util/argParser.js

@@ -1,17 +1,17 @@
-const path = require("path");
-const fs = require("fs");
-const os = require("os");
+import path from "path";
+import fs from "fs";
+import os from "os";

-const yaml = require("js-yaml");
-const puppeteer = require("puppeteer-core");
-const { Cluster } = require("puppeteer-cluster");
-const yargs = require("yargs/yargs");
-const { hideBin } = require("yargs/helpers");
+import yaml from "js-yaml";
+import puppeteer from "puppeteer-core";
+import { Cluster } from "puppeteer-cluster";
+import yargs from "yargs";
+import { hideBin } from "yargs/helpers";

-const { ReuseWindowConcurrency } = require("./windowconcur");
-const { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } = require("./constants");
-const { ScopedSeed } = require("./seeds");
-const { interpolateFilename } = require("./storage");
+import { ReuseWindowConcurrency } from "./windowconcur.js";
+import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } from "./constants.js";
+import { ScopedSeed } from "./seeds.js";
+import { interpolateFilename } from "./storage.js";


 // ============================================================================

@@ -127,7 +127,7 @@ class ArgParser {
     "driver": {
       describe: "JS driver for the crawler",
       type: "string",
-      default: path.join(__dirname, "..", "defaultDriver.js"),
+      default: "./defaultDriver.js",
     },

     "generateCDX": {

@@ -459,6 +459,6 @@ class ArgParser {
   }
 }

-module.exports.parseArgs = function(argv) {
+export function parseArgs(argv) {
   return new ArgParser().parseArgs(argv);
-};
+}
util/blockrules.js

@@ -1,5 +1,3 @@
-const fetch = require("node-fetch");
-
 const RULE_TYPES = ["block", "allowOnly"];

 const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];

@@ -44,7 +42,7 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}


 // ===========================================================================
-class BlockRules
+export class BlockRules
 {
   constructor(blockRules, blockPutUrl, blockErrMsg, debugLog) {
     this.rules = [];

@@ -224,6 +222,3 @@ class BlockRules
     await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
   }
 }
-
-module.exports.BlockRules = BlockRules;
util/browser.js

@@ -1,13 +1,13 @@
-const child_process = require("child_process");
-const fs = require("fs");
-const path = require("path");
-const os = require("os");
-const request = require("request");
-const { initStorage } = require("./storage");
+import child_process from "child_process";
+import fs from "fs";
+import path from "path";
+import os from "os";
+import request from "request";
+import { initStorage } from "./storage.js";

 const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));

-module.exports.loadProfile = async function(profileFilename) {
+export async function loadProfile(profileFilename) {
   const targetFilename = "/tmp/profile.tar.gz";

   if (profileFilename &&

@@ -46,13 +46,13 @@ module.exports.loadProfile = async function(profileFilename) {
   }

   return profileDir;
-};
+}

-module.exports.saveProfile = function(profileFilename) {
+export function saveProfile(profileFilename) {
   child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: profileDir});
-};
+}

-function getBrowserExe() {
+export function getBrowserExe() {
   const files = [process.env.BROWSER_BIN, "/usr/bin/google-chrome", "/usr/bin/chromium-browser"];
   for (const file of files) {
     if (file && fs.existsSync(file)) {

@@ -64,10 +64,8 @@ function getBrowserExe() {
 }


-module.exports.getBrowserExe = getBrowserExe;
-
-
-function getDefaultUA() {
+export function getDefaultUA() {
   let version = process.env.BROWSER_VERSION;

   try {

@@ -81,7 +79,6 @@ function getDefaultUA() {
 }

-module.exports.getDefaultUA = getDefaultUA;

 // from https://github.com/microsoft/playwright/blob/main/packages/playwright-core/src/server/chromium/chromium.ts#L327

@@ -120,7 +117,7 @@ const DEFAULT_PLAYWRIGHT_FLAGS = [
 ];


-module.exports.chromeArgs = (proxy, userAgent=null, extraArgs=[]) => {
+export function chromeArgs (proxy, userAgent=null, extraArgs=[]) {
   // Chrome Flags, including proxy server
   const args = [
     ...DEFAULT_PLAYWRIGHT_FLAGS,

@@ -140,10 +137,10 @@ module.exports.chromeArgs = (proxy, userAgent=null, extraArgs=[]) => {
   }

   return args;
-};
+}


-module.exports.evaluateWithCLI = async (frame, funcString) => {
+export async function evaluateWithCLI(frame, funcString) {
   const context = await frame.executionContext();

   // from puppeteer _evaluateInternal() but with includeCommandLineAPI: true

@@ -167,12 +164,12 @@ module.exports.evaluateWithCLI = async (frame, funcString) => {
   }

   return remoteObject.value;
-};
+}


-module.exports.sleep = async (time) => {
+export async function sleep(time) {
   return new Promise(resolve => setTimeout(resolve, time));
-};
+}
util/constants.js

@@ -1,9 +1,9 @@

-module.exports.HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
-module.exports.WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
-module.exports.BEHAVIOR_LOG_FUNC = "__bx_log";
+export const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
+export const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
+export const BEHAVIOR_LOG_FUNC = "__bx_log";

-module.exports.DEFAULT_SELECTORS = [{
+export const DEFAULT_SELECTORS = [{
   selector: "a[href]",
   extract: "href",
   isAttribute: false
util/redis.js

@@ -1,7 +1,7 @@
-const Redis = require("ioredis");
+import Redis from "ioredis";

-module.exports.initRedis = async function(url) {
+export async function initRedis(url) {
   const redis = new Redis(url, {lazyConnect: true});
   await redis.connect();
   return redis;
-};
+}
util/screencaster.js

@@ -1,12 +1,11 @@
-const ws = require("ws");
-const http = require("http");
-const url = require("url");
-const fs = require("fs");
-const path = require("path");
+import ws from "ws";
+import http from "http";
+import url from "url";
+import fs from "fs";

-const { initRedis } = require("./redis");
+import { initRedis } from "./redis.js";

-const indexHTML = fs.readFileSync(path.join(__dirname, "..", "html", "screencast.html"), {encoding: "utf8"});
+const indexHTML = fs.readFileSync(new URL("../html/screencast.html", import.meta.url), {encoding: "utf8"});


 // ===========================================================================

@@ -304,4 +303,4 @@ class ScreenCaster
   }
 }

-module.exports = { ScreenCaster, WSTransport, RedisPubSubTransport };
+export { ScreenCaster, WSTransport, RedisPubSubTransport };
util/seeds.js

@@ -1,4 +1,4 @@
-class ScopedSeed
+export class ScopedSeed
 {
   constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
     const parsedUrl = this.parseUrl(url);

@@ -160,16 +160,14 @@ class ScopedSeed
   }
 }

-function rxEscape(string) {
+export function rxEscape(string) {
   return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
 }

-function urlRxEscape(url, parsedUrl) {
+export function urlRxEscape(url, parsedUrl) {
   return rxEscape(url).replace(parsedUrl.protocol, "https?:");
 }
-
-
-module.exports.ScopedSeed = ScopedSeed;
-module.exports.rxEscape = rxEscape;
util/state.js

@@ -1,8 +1,9 @@
-const Job = require("puppeteer-cluster/dist/Job").default;
+import mod from "puppeteer-cluster/dist/Job.js";
+const Job = mod.default;


 // ============================================================================
-class BaseState
+export class BaseState
 {
   constructor() {
     this.drainMax = 0;

@@ -43,7 +44,7 @@ class BaseState


 // ============================================================================
-class MemoryCrawlState extends BaseState
+export class MemoryCrawlState extends BaseState
 {
   constructor() {
     super();

@@ -177,7 +178,7 @@ class MemoryCrawlState extends BaseState


 // ============================================================================
-class RedisCrawlState extends BaseState
+export class RedisCrawlState extends BaseState
 {
   constructor(redis, key, pageTimeout, uid) {
     super();

@@ -455,5 +456,3 @@ return 0;
   }
 }

-module.exports.RedisCrawlState = RedisCrawlState;
-module.exports.MemoryCrawlState = MemoryCrawlState;
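puppeteer-cluster ships transpiled CommonJS, so its classes cannot be pulled in with a named ESM import; the module namespace is imported whole and the .default property unwrapped by hand, as in the hunk above. A short sketch of that interop pattern (the sanity check is illustrative only):

// CJS interop: `import x from "cjs-module"` yields module.exports,
// so a transpiled `exports.default` class still has to be unwrapped.
import mod from "puppeteer-cluster/dist/Job.js";

const Job = mod.default;

if (typeof Job !== "function") {
  throw new Error("unexpected module shape");  // illustrative sanity check
}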
util/storage.js

@@ -1,20 +1,18 @@
-const fs = require("fs");
-const fsp = require("fs/promises");
+import fs from "fs";
+import fsp from "fs/promises";

-const os = require("os");
-const { createHash } = require("crypto");
+import os from "os";
+import { createHash } from "crypto";

-const fetch = require("node-fetch");
-const Minio = require("minio");
+import Minio from "minio";

-const { initRedis } = require("./redis");
+import { initRedis } from "./redis.js";

-const util = require("util");
-const getFolderSize = util.promisify(require("get-folder-size"));
+import getFolderSize from "get-folder-size";


 // ===========================================================================
-class S3StorageSync
+export class S3StorageSync
 {
   constructor(urlOrData, {webhookUrl, userId, crawlId} = {}) {
     let url;

@@ -110,7 +108,7 @@ class S3StorageSync
   }
 }

-function initStorage() {
+export function initStorage() {
   if (!process.env.STORE_ENDPOINT_URL) {
     return null;
   }

@@ -133,12 +131,12 @@ function initStorage() {
 }


-async function getFileSize(filename) {
+export async function getFileSize(filename) {
   const stats = await fsp.stat(filename);
   return stats.size;
 }

-async function getDirSize(dir) {
+export async function getDirSize(dir) {
   return await getFolderSize(dir);
 }

@@ -152,7 +150,7 @@ function checksumFile(hashName, path) {
   });
 }

-function interpolateFilename(filename, crawlId) {
+export function interpolateFilename(filename, crawlId) {
   filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.-]/g, ""));
   filename = filename.replace("@hostname", os.hostname());
   filename = filename.replace("@hostsuffix", os.hostname().slice(-14));

@@ -160,8 +158,3 @@ function interpolateFilename(filename, crawlId) {
   return filename;
 }
-
-module.exports.S3StorageSync = S3StorageSync;
-module.exports.getFileSize = getFileSize;
-module.exports.getDirSize = getDirSize;
-module.exports.initStorage = initStorage;
-module.exports.interpolateFilename = interpolateFilename;
util/textextract.js

@@ -1,4 +1,4 @@
-class TextExtract {
+export class TextExtract {

   constructor(dom){
     this.dom = dom;

@@ -56,4 +56,3 @@ class TextExtract {
   }
 }

-module.exports = TextExtract;

util/windowconcur.js

@@ -1,8 +1,10 @@
-const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;
+import sbi from "puppeteer-cluster/dist/concurrency/SingleBrowserImplementation.js";
+
+const SingleBrowserImplementation = sbi.default;


 // ===========================================================================
-class ReuseWindowConcurrency extends SingleBrowserImplementation {
+export class ReuseWindowConcurrency extends SingleBrowserImplementation {
   async init() {
     await super.init();

@@ -110,6 +112,3 @@ class ReuseWindowConcurrency extends SingleBrowserImplementation {
   }
 }
-
-module.exports = { ReuseWindowConcurrency };