feat: Add custom behavior injection (#285)

* support loading custom behaviors from a specified directory via --customBehaviors
* call load() for each behavior incrementally, then call selectMainBehavior() (available in browsertrix-behaviors 0.5.1)
* tests: add tests for multiple custom behaviors

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
Amani 2023-07-06 16:09:48 -04:00 committed by GitHub
parent 74831373fd
commit 442f4486d3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 139 additions and 6 deletions

View file

@ -21,6 +21,7 @@ import { initRedis } from "./util/redis.js";
import { logger, errJSON } from "./util/logger.js"; import { logger, errJSON } from "./util/logger.js";
import { runWorkers } from "./util/worker.js"; import { runWorkers } from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js"; import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
import { collectAllFileSources } from "./util/file_reader.js";
import { Browser } from "./util/browser.js"; import { Browser } from "./util/browser.js";
@ -120,6 +121,7 @@ export class Crawler {
this.done = false; this.done = false;
this.customBehaviors = "";
this.behaviorLastLine = null; this.behaviorLastLine = null;
this.browser = new Browser(); this.browser = new Browser();
@ -233,6 +235,10 @@ export class Crawler {
} }
} }
if (this.params.customBehaviors) {
this.customBehaviors = this.loadCustomBehaviors(this.params.customBehaviors);
}
let opts = {}; let opts = {};
let redisStdio; let redisStdio;
@ -347,6 +353,10 @@ export class Crawler {
} }
break; break;
case "error":
logger.error(message, details, "behaviorScript");
break;
case "debug": case "debug":
default: default:
logger.debug(message, details, "behaviorScript"); logger.debug(message, details, "behaviorScript");
@ -399,10 +409,28 @@ export class Crawler {
if (this.params.behaviorOpts) { if (this.params.behaviorOpts) {
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata, page.url(), workerid)); await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata, page.url(), workerid));
await this.browser.addInitScript(page, behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`); await this.browser.addInitScript(page, behaviors);
const initScript = `
self.__bx_behaviors.init(${this.params.behaviorOpts}, false);
${this.customBehaviors}
self.__bx_behaviors.selectMainBehavior();
`;
await this.browser.addInitScript(page, initScript);
} }
} }
loadCustomBehaviors(filename) {
let str = "";
for (const source of collectAllFileSources(filename, ".js")) {
str += `self.__bx_behaviors.load(${source});\n`;
}
return str;
}
async crawlPage(opts) { async crawlPage(opts) {
await this.writeStats(); await this.writeStats();

View file

@ -0,0 +1,23 @@
import child_process from "child_process";
// End-to-end check of --customBehaviors: runs the crawler image with the
// tests/custom-behaviors/ directory mounted at /custom-behaviors/ and then
// inspects the crawler's stdout for the log entries each behavior emits.
test("test custom behaviors", () => {
  const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page");

  const log = res.toString();

  // toContain() is used instead of `indexOf(...) > 0` so that a match at
  // offset 0 still counts and a failure prints the text that was searched.

  // custom behavior ran for example.com
  expect(log).toContain("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.com/\",\"workerid\":0}}");

  // but not for example.org
  expect(log).not.toContain("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.org/\",\"workerid\":0}}");

  expect(log).toContain("{\"state\":{\"segments\":1},\"msg\":\"Skipping autoscroll, page seems to not be responsive to scrolling events\",\"page\":\"https://example.org/\",\"workerid\":0}}");

  // another custom behavior ran for webrecorder.net
  expect(log).toContain("{\"state\":{},\"msg\":\"test-stat-2\",\"page\":\"https://webrecorder.net/\",\"workerid\":0}}");
});

View file

@ -0,0 +1,22 @@
/**
 * Test fixture: a custom behavior that only applies to pages whose origin
 * is https://webrecorder.net.
 */
class TestBehavior2 {
  /** Initial behavior data: an object holding an empty `state`. */
  static init() {
    const state = {};
    return { state };
  }

  /** Identifier of this behavior. */
  static get id() {
    return "TestBehavior2";
  }

  /** True only when the current page origin is https://webrecorder.net. */
  static isMatch() {
    const { origin } = window.location;
    return origin === "https://webrecorder.net";
  }

  /**
   * Logs one message through ctx.log, then yields the value returned by
   * ctx.Lib.getState(ctx, "test-stat-2").
   */
  async* run(ctx) {
    ctx.log("In Test Behavior 2!");
    const update = ctx.Lib.getState(ctx, "test-stat-2");
    yield update;
  }
}

View file

@ -0,0 +1,22 @@
/**
 * Test fixture: a custom behavior that only applies to pages whose origin
 * is https://example.com.
 */
class TestBehavior {
  /** Initial behavior data: an object holding an empty `state`. */
  static init() {
    const state = {};
    return { state };
  }

  /** Identifier of this behavior. */
  static get id() {
    return "TestBehavior";
  }

  /** True only when the current page origin is https://example.com. */
  static isMatch() {
    const { origin } = window.location;
    return origin === "https://example.com";
  }

  /**
   * Logs one message through ctx.log, then yields the value returned by
   * ctx.Lib.getState(ctx, "test-stat").
   */
  async* run(ctx) {
    ctx.log("In Test Behavior!");
    const update = ctx.Lib.getState(ctx, "test-stat");
    yield update;
  }
}

View file

@ -385,7 +385,12 @@ class ArgParser {
describe: "If set, crawler will fail with exit code 1 if any seed fails", describe: "If set, crawler will fail with exit code 1 if any seed fails",
type: "boolean", type: "boolean",
default: false default: false
} },
"customBehaviors": {
describe: "injects a custom behavior file or set of behavior files in a directory",
type: ["string"]
},
}; };
} }

33
util/file_reader.js Normal file
View file

@ -0,0 +1,33 @@
import fs from "fs";
import path from "path";
// Recursion limit for collectAllFileSources(): the starting path is depth 0,
// and any path reached at this depth or deeper is skipped with a warning.
const MAX_DEPTH = 2;

/**
 * Collect the contents of a single file, or of the files found under a
 * directory, as an array of source strings.
 *
 * Each returned string is the file's text prefixed with a
 * `/* src: <absolute path> *\/` comment naming where it came from.
 *
 * @param {string} fileOrDir - path to a file or a directory
 * @param {string|null} [ext=null] - if set, only files whose extension
 *   (as reported by path.extname, e.g. ".js") equals this are collected
 * @param {number} [depth=0] - current recursion depth; callers normally omit it
 * @returns {string[]} collected sources; always an array, possibly empty
 * @throws {Error} whatever fs.statSync / fs.readdirSync / fs.readFileSync
 *   throw, e.g. when the path does not exist
 */
export function collectAllFileSources(fileOrDir, ext = null, depth = 0) {
  const resolvedPath = path.resolve(fileOrDir);

  if (depth >= MAX_DEPTH) {
    console.warn(`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`);
    return [];
  }

  const stat = fs.statSync(resolvedPath);

  // isFile / isDirectory are methods on fs.Stats and must be invoked;
  // referencing them without calling is always truthy, which sent
  // non-matching files into readdirSync (ENOTDIR) and, with ext === null,
  // directories into readFileSync (EISDIR).
  if (stat.isFile()) {
    if (ext === null || path.extname(resolvedPath) === ext) {
      const contents = fs.readFileSync(resolvedPath, "utf8");
      return [`/* src: ${resolvedPath} */\n\n${contents}`];
    }
  } else if (stat.isDirectory()) {
    return fs.readdirSync(resolvedPath).flatMap((entry) =>
      collectAllFileSources(path.join(fileOrDir, entry), ext, depth + 1)
    );
  }

  // Nothing usable at this path. Only warn about the path the caller passed
  // in; non-matching entries met while walking a directory are skipped
  // silently.
  if (depth === 0) {
    console.warn(`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`);
  }

  // Always return an array so callers and the recursive flatMap can rely on it.
  return [];
}

View file

@ -1147,10 +1147,10 @@ browserslist@^4.21.3:
node-releases "^2.0.6" node-releases "^2.0.6"
update-browserslist-db "^1.0.9" update-browserslist-db "^1.0.9"
browsertrix-behaviors@^0.5.0-beta.0: browsertrix-behaviors@^0.5.1:
version "0.5.0-beta.0" version "0.5.1"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.0-beta.0.tgz#d1a7c35cda31d740a374df1e833f36bd1890768d" resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.1.tgz#c4756b349dcabd23e25f851cec804d92e94eb63b"
integrity sha512-RQMQlbV4OBAzYyhTI7imoem8p4MTj2XSDzlIZvA5sC5U89OMnJ0VM5KBAJzET3PUJkQlUQEOTiXtnsnodHXTUQ== integrity sha512-cNSSpQyQT73Y5NcBn2PFDkZM2ptxHVVcqxstryvtzZNOW9gGqzJlLPo8tmCBY00JHrMyn5rm8qImbFglcG/DKg==
bser@2.1.1: bser@2.1.1:
version "2.1.1" version "2.1.1"