mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
feat: Add custom behavior injection (#285)
* support loading custom behaviors from a specified directory via --customBehaviors
* call load() for each behavior incrementally, then call selectMainBehavior() (available in browsertrix-behaviors 0.5.1)
* tests: add tests for multiple custom behaviors

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
parent
74831373fd
commit
442f4486d3
7 changed files with 139 additions and 6 deletions
30
crawler.js
30
crawler.js
|
@ -21,6 +21,7 @@ import { initRedis } from "./util/redis.js";
|
||||||
import { logger, errJSON } from "./util/logger.js";
|
import { logger, errJSON } from "./util/logger.js";
|
||||||
import { runWorkers } from "./util/worker.js";
|
import { runWorkers } from "./util/worker.js";
|
||||||
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
|
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
|
||||||
|
import { collectAllFileSources } from "./util/file_reader.js";
|
||||||
|
|
||||||
import { Browser } from "./util/browser.js";
|
import { Browser } from "./util/browser.js";
|
||||||
|
|
||||||
|
@ -120,6 +121,7 @@ export class Crawler {
|
||||||
|
|
||||||
this.done = false;
|
this.done = false;
|
||||||
|
|
||||||
|
this.customBehaviors = "";
|
||||||
this.behaviorLastLine = null;
|
this.behaviorLastLine = null;
|
||||||
|
|
||||||
this.browser = new Browser();
|
this.browser = new Browser();
|
||||||
|
@ -233,6 +235,10 @@ export class Crawler {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (this.params.customBehaviors) {
|
||||||
|
this.customBehaviors = this.loadCustomBehaviors(this.params.customBehaviors);
|
||||||
|
}
|
||||||
|
|
||||||
let opts = {};
|
let opts = {};
|
||||||
let redisStdio;
|
let redisStdio;
|
||||||
|
|
||||||
|
@ -347,6 +353,10 @@ export class Crawler {
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case "error":
|
||||||
|
logger.error(message, details, "behaviorScript");
|
||||||
|
break;
|
||||||
|
|
||||||
case "debug":
|
case "debug":
|
||||||
default:
|
default:
|
||||||
logger.debug(message, details, "behaviorScript");
|
logger.debug(message, details, "behaviorScript");
|
||||||
|
@ -399,10 +409,28 @@ export class Crawler {
|
||||||
|
|
||||||
if (this.params.behaviorOpts) {
|
if (this.params.behaviorOpts) {
|
||||||
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata, page.url(), workerid));
|
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata, page.url(), workerid));
|
||||||
await this.browser.addInitScript(page, behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
|
await this.browser.addInitScript(page, behaviors);
|
||||||
|
|
||||||
|
const initScript = `
|
||||||
|
self.__bx_behaviors.init(${this.params.behaviorOpts}, false);
|
||||||
|
${this.customBehaviors}
|
||||||
|
self.__bx_behaviors.selectMainBehavior();
|
||||||
|
`;
|
||||||
|
|
||||||
|
await this.browser.addInitScript(page, initScript);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
loadCustomBehaviors(filename) {
|
||||||
|
let str = "";
|
||||||
|
|
||||||
|
for (const source of collectAllFileSources(filename, ".js")) {
|
||||||
|
str += `self.__bx_behaviors.load(${source});\n`;
|
||||||
|
}
|
||||||
|
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
async crawlPage(opts) {
|
async crawlPage(opts) {
|
||||||
await this.writeStats();
|
await this.writeStats();
|
||||||
|
|
||||||
|
|
23
tests/custom-behavior.test.js
Normal file
23
tests/custom-behavior.test.js
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
import child_process from "child_process";
|
||||||
|
|
||||||
|
// Runs a real crawl in docker with the custom behaviors directory mounted, then
// inspects the crawl log for the state messages emitted by each behavior.
test("test custom behaviors", async () => {
  const output = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page");

  const log = output.toString();

  // true when the given log fragment appears somewhere after the first character
  const wasLogged = (fragment) => log.indexOf(fragment) > 0;

  // custom behavior ran for example.com
  expect(wasLogged("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.com/\",\"workerid\":0}}")).toBe(true);

  // but not for example.org
  expect(wasLogged("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.org/\",\"workerid\":0}}")).toBe(false);

  // example.org fell through to the built-in autoscroll behavior instead
  expect(wasLogged("{\"state\":{\"segments\":1},\"msg\":\"Skipping autoscroll, page seems to not be responsive to scrolling events\",\"page\":\"https://example.org/\",\"workerid\":0}}")).toBe(true);

  // another custom behavior ran for webrecorder.net
  expect(wasLogged("{\"state\":{},\"msg\":\"test-stat-2\",\"page\":\"https://webrecorder.net/\",\"workerid\":0}}")).toBe(true);
});
|
||||||
|
|
||||||
|
|
22
tests/custom-behaviors/custom-2.js
Normal file
22
tests/custom-behaviors/custom-2.js
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
/**
 * Test fixture: a custom behavior that only applies to https://webrecorder.net.
 *
 * NOTE: the crawler splices this file's text into a
 * `self.__bx_behaviors.load(...)` call, so the file must remain a single class
 * expression (comments are fine, extra statements are not).
 */
class TestBehavior2 {
  /** Initial behavior data: a fresh, empty state object on every call. */
  static init() {
    const state = {};
    return { state };
  }

  /** Identifier of this behavior. */
  static get id() {
    return "TestBehavior2";
  }

  /** True only when the current page's origin is https://webrecorder.net. */
  static isMatch() {
    const { origin } = window.location;
    return origin === "https://webrecorder.net";
  }

  /**
   * Logs a marker line, then yields whatever ctx.Lib.getState() returns for
   * the "test-stat-2" message.
   */
  async* run(ctx) {
    const { log, Lib } = { log: (text) => ctx.log(text), Lib: ctx.Lib };
    log("In Test Behavior 2!");
    yield Lib.getState(ctx, "test-stat-2");
  }
}
|
22
tests/custom-behaviors/custom.js
Normal file
22
tests/custom-behaviors/custom.js
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
/**
 * Test fixture: a custom behavior that only applies to https://example.com.
 *
 * NOTE: the crawler splices this file's text into a
 * `self.__bx_behaviors.load(...)` call, so the file must remain a single class
 * expression (comments are fine, extra statements are not).
 */
class TestBehavior {
  /** Initial behavior data: a fresh, empty state object on every call. */
  static init() {
    const state = {};
    return { state };
  }

  /** Identifier of this behavior. */
  static get id() {
    return "TestBehavior";
  }

  /** True only when the current page's origin is https://example.com. */
  static isMatch() {
    const { origin } = window.location;
    return origin === "https://example.com";
  }

  /**
   * Logs a marker line, then yields whatever ctx.Lib.getState() returns for
   * the "test-stat" message.
   */
  async* run(ctx) {
    const { log, Lib } = { log: (text) => ctx.log(text), Lib: ctx.Lib };
    log("In Test Behavior!");
    yield Lib.getState(ctx, "test-stat");
  }
}
|
|
@ -385,7 +385,12 @@ class ArgParser {
|
||||||
describe: "If set, crawler will fail with exit code 1 if any seed fails",
|
describe: "If set, crawler will fail with exit code 1 if any seed fails",
|
||||||
type: "boolean",
|
type: "boolean",
|
||||||
default: false
|
default: false
|
||||||
}
|
},
|
||||||
|
|
||||||
|
"customBehaviors": {
|
||||||
|
describe: "injects a custom behavior file or set of behavior files in a directory",
|
||||||
|
type: ["string"]
|
||||||
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
33
util/file_reader.js
Normal file
33
util/file_reader.js
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
import fs from "fs";
|
||||||
|
import path from "path";
|
||||||
|
|
||||||
|
// Traversal stops once `depth` reaches this value. With a directory passed at
// depth 0, its direct children (depth 1) are read, but anything inside a
// subdirectory (depth 2) is skipped with a warning.
const MAX_DEPTH = 2;

/**
 * Collect the text of a single file, or of every matching file found while
 * walking a directory.
 *
 * Each returned entry is the file's contents prefixed with a
 * `/* src: <absolute path> *\/` comment identifying where it came from.
 *
 * @param {string} fileOrDir - path to a file or a directory
 * @param {string|null} [ext=null] - required extension including the dot
 *   (e.g. ".js"); null accepts any file
 * @param {number} [depth=0] - current recursion depth (callers normally omit it)
 * @returns {string[]} annotated sources; always an array, possibly empty
 * @throws {Error} whatever fs.statSync / fs.readdirSync / fs.readFileSync
 *   throw, e.g. when the path does not exist
 */
export function collectAllFileSources(fileOrDir, ext = null, depth = 0) {
  const resolvedPath = path.resolve(fileOrDir);

  if (depth >= MAX_DEPTH) {
    console.warn(`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`);
    return [];
  }

  const stat = fs.statSync(resolvedPath);

  // isFile/isDirectory are methods: referencing them without calling is always
  // truthy, which previously sent non-matching files into readdirSync (ENOTDIR).
  if (stat.isFile() && (ext === null || path.extname(resolvedPath) === ext)) {
    const contents = fs.readFileSync(resolvedPath, "utf8");
    return [`/* src: ${resolvedPath} */\n\n${contents}`];
  }

  if (stat.isDirectory()) {
    return fs.readdirSync(resolvedPath).flatMap((entry) =>
      collectAllFileSources(path.join(fileOrDir, entry), ext, depth + 1)
    );
  }

  // Neither a matching file nor a directory. Only complain about the path the
  // user actually passed in; silently skip non-matching entries found deeper.
  if (depth === 0) {
    console.warn(`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`);
  }

  // Always return an array so the recursive flatMap (and callers iterating the
  // result) never see undefined.
  return [];
}
|
|
@ -1147,10 +1147,10 @@ browserslist@^4.21.3:
|
||||||
node-releases "^2.0.6"
|
node-releases "^2.0.6"
|
||||||
update-browserslist-db "^1.0.9"
|
update-browserslist-db "^1.0.9"
|
||||||
|
|
||||||
browsertrix-behaviors@^0.5.0-beta.0:
|
browsertrix-behaviors@^0.5.1:
|
||||||
version "0.5.0-beta.0"
|
version "0.5.1"
|
||||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.0-beta.0.tgz#d1a7c35cda31d740a374df1e833f36bd1890768d"
|
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.1.tgz#c4756b349dcabd23e25f851cec804d92e94eb63b"
|
||||||
integrity sha512-RQMQlbV4OBAzYyhTI7imoem8p4MTj2XSDzlIZvA5sC5U89OMnJ0VM5KBAJzET3PUJkQlUQEOTiXtnsnodHXTUQ==
|
integrity sha512-cNSSpQyQT73Y5NcBn2PFDkZM2ptxHVVcqxstryvtzZNOW9gGqzJlLPo8tmCBY00JHrMyn5rm8qImbFglcG/DKg==
|
||||||
|
|
||||||
bser@2.1.1:
|
bser@2.1.1:
|
||||||
version "2.1.1"
|
version "2.1.1"
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue