mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
feat: Add custom behavior injection (#285)
* support loading custom behaviors from a specified directory via --customBehaviors
* call load() for each behavior incrementally, then call selectMainBehavior() (available in browsertrix-behaviors 0.5.1)
* tests: add tests for multiple custom behaviors

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
parent
74831373fd
commit
442f4486d3
7 changed files with 139 additions and 6 deletions
30
crawler.js
30
crawler.js
|
@ -21,6 +21,7 @@ import { initRedis } from "./util/redis.js";
|
|||
import { logger, errJSON } from "./util/logger.js";
|
||||
import { runWorkers } from "./util/worker.js";
|
||||
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
|
||||
import { collectAllFileSources } from "./util/file_reader.js";
|
||||
|
||||
import { Browser } from "./util/browser.js";
|
||||
|
||||
|
@ -120,6 +121,7 @@ export class Crawler {
|
|||
|
||||
this.done = false;
|
||||
|
||||
this.customBehaviors = "";
|
||||
this.behaviorLastLine = null;
|
||||
|
||||
this.browser = new Browser();
|
||||
|
@ -233,6 +235,10 @@ export class Crawler {
|
|||
}
|
||||
}
|
||||
|
||||
if (this.params.customBehaviors) {
|
||||
this.customBehaviors = this.loadCustomBehaviors(this.params.customBehaviors);
|
||||
}
|
||||
|
||||
let opts = {};
|
||||
let redisStdio;
|
||||
|
||||
|
@ -347,6 +353,10 @@ export class Crawler {
|
|||
}
|
||||
break;
|
||||
|
||||
case "error":
|
||||
logger.error(message, details, "behaviorScript");
|
||||
break;
|
||||
|
||||
case "debug":
|
||||
default:
|
||||
logger.debug(message, details, "behaviorScript");
|
||||
|
@ -399,10 +409,28 @@ export class Crawler {
|
|||
|
||||
if (this.params.behaviorOpts) {
|
||||
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata, page.url(), workerid));
|
||||
await this.browser.addInitScript(page, behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
|
||||
await this.browser.addInitScript(page, behaviors);
|
||||
|
||||
const initScript = `
|
||||
self.__bx_behaviors.init(${this.params.behaviorOpts}, false);
|
||||
${this.customBehaviors}
|
||||
self.__bx_behaviors.selectMainBehavior();
|
||||
`;
|
||||
|
||||
await this.browser.addInitScript(page, initScript);
|
||||
}
|
||||
}
|
||||
|
||||
loadCustomBehaviors(filename) {
|
||||
let str = "";
|
||||
|
||||
for (const source of collectAllFileSources(filename, ".js")) {
|
||||
str += `self.__bx_behaviors.load(${source});\n`;
|
||||
}
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
async crawlPage(opts) {
|
||||
await this.writeStats();
|
||||
|
||||
|
|
23
tests/custom-behavior.test.js
Normal file
23
tests/custom-behavior.test.js
Normal file
|
@ -0,0 +1,23 @@
|
|||
import child_process from "child_process";
|
||||
|
||||
// Runs the crawler image via docker with tests/custom-behaviors mounted at
// /custom-behaviors/ and three seed URLs, then inspects the captured stdout
// for the log entries emitted by the custom behaviors.
test("test custom behaviors", async () => {
  const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page");

  const log = res.toString();

  // includes() is used instead of `indexOf(...) > 0`, which would report a
  // match at offset 0 as "not found".

  // custom behavior ran for example.com
  expect(log.includes("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.com/\",\"workerid\":0}}")).toBe(true);

  // but not for example.org
  expect(log.includes("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.org/\",\"workerid\":0}}")).toBe(false);

  // example.org is expected to log the autoscroll message instead
  expect(log.includes("{\"state\":{\"segments\":1},\"msg\":\"Skipping autoscroll, page seems to not be responsive to scrolling events\",\"page\":\"https://example.org/\",\"workerid\":0}}")).toBe(true);

  // another custom behavior ran for webrecorder.net
  expect(log.includes("{\"state\":{},\"msg\":\"test-stat-2\",\"page\":\"https://webrecorder.net/\",\"workerid\":0}}")).toBe(true);
});
|
||||
|
||||
|
22
tests/custom-behaviors/custom-2.js
Normal file
22
tests/custom-behaviors/custom-2.js
Normal file
|
@ -0,0 +1,22 @@
|
|||
/**
 * Test fixture: custom behavior that matches only pages whose origin is
 * https://webrecorder.net and yields a single "test-stat-2" state.
 *
 * NOTE: keep this file a single class expression (no extra statements) --
 * the crawler interpolates the file contents into a load(...) call.
 */
class TestBehavior2 {
  /** Initial behavior state: an empty `state` object. */
  static init() {
    const state = {};
    return { state };
  }

  /** Identifier of this behavior. */
  static get id() {
    return "TestBehavior2";
  }

  /** True only when the current page origin is https://webrecorder.net. */
  static isMatch() {
    const { origin } = window.location;
    return origin === "https://webrecorder.net";
  }

  /** Logs a marker message, then yields one state built via ctx.Lib.getState. */
  async* run(ctx) {
    ctx.log("In Test Behavior 2!");
    const { Lib } = ctx;
    yield Lib.getState(ctx, "test-stat-2");
  }
}
|
22
tests/custom-behaviors/custom.js
Normal file
22
tests/custom-behaviors/custom.js
Normal file
|
@ -0,0 +1,22 @@
|
|||
/**
 * Test fixture: custom behavior that matches only pages whose origin is
 * https://example.com and yields a single "test-stat" state.
 *
 * NOTE: keep this file a single class expression (no extra statements) --
 * the crawler interpolates the file contents into a load(...) call.
 */
class TestBehavior {
  /** Initial behavior state: an empty `state` object. */
  static init() {
    const state = {};
    return { state };
  }

  /** Identifier of this behavior. */
  static get id() {
    return "TestBehavior";
  }

  /** True only when the current page origin is https://example.com. */
  static isMatch() {
    const { origin } = window.location;
    return origin === "https://example.com";
  }

  /** Logs a marker message, then yields one state built via ctx.Lib.getState. */
  async* run(ctx) {
    ctx.log("In Test Behavior!");
    const { Lib } = ctx;
    yield Lib.getState(ctx, "test-stat");
  }
}
|
|
@ -385,7 +385,12 @@ class ArgParser {
|
|||
describe: "If set, crawler will fail with exit code 1 if any seed fails",
|
||||
type: "boolean",
|
||||
default: false
|
||||
}
|
||||
},
|
||||
|
||||
"customBehaviors": {
|
||||
describe: "injects a custom behavior file or set of behavior files in a directory",
|
||||
type: ["string"]
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
|
|
33
util/file_reader.js
Normal file
33
util/file_reader.js
Normal file
|
@ -0,0 +1,33 @@
|
|||
import fs from "fs";
|
||||
import path from "path";
|
||||
|
||||
// Traversal limit: depth 0 is the path passed in, depth 1 its direct
// children. Anything reached at depth >= MAX_DEPTH is skipped with a warning.
const MAX_DEPTH = 2;

/**
 * Collects the contents of a file, or of the files found under a directory,
 * as an array of source strings.
 *
 * Each returned string is prefixed with a `/* src: <absolute path> *\/`
 * comment followed by a blank line and the file contents.
 *
 * @param {string} fileOrDir - file or directory to read
 * @param {string|null} [ext=null] - if set, only files whose extension
 *   (as reported by path.extname, e.g. ".js") equals this value are read
 * @param {number} [depth=0] - current recursion depth; callers normally omit it
 * @returns {string[]} collected sources; empty when nothing matched or the
 *   depth limit was hit
 * @throws errors from fs.statSync / fs.readdirSync / fs.readFileSync propagate
 *   unchanged (e.g. when the path does not exist)
 */
export function collectAllFileSources(fileOrDir, ext = null, depth = 0) {
  const resolvedPath = path.resolve(fileOrDir);

  if (depth >= MAX_DEPTH) {
    console.warn(`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`);
    return [];
  }

  const stat = fs.statSync(resolvedPath);

  // isFile()/isDirectory() are methods on fs.Stats; referencing them without
  // calling is always truthy, which sent non-matching files to readdirSync.
  if (stat.isFile() && (ext === null || path.extname(resolvedPath) === ext)) {
    const contents = fs.readFileSync(resolvedPath, "utf8");
    return [`/* src: ${resolvedPath} */\n\n${contents}`];
  }

  if (stat.isDirectory()) {
    return fs.readdirSync(resolvedPath).flatMap((next) =>
      collectAllFileSources(path.join(fileOrDir, next), ext, depth + 1)
    );
  }

  // Only complain about the path the caller explicitly asked for; entries
  // found while walking a directory that do not match are skipped silently.
  if (depth === 0) {
    console.warn(`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`);
  }

  // Always return an array so recursive callers can flatten the result.
  return [];
}
|
|
@ -1147,10 +1147,10 @@ browserslist@^4.21.3:
|
|||
node-releases "^2.0.6"
|
||||
update-browserslist-db "^1.0.9"
|
||||
|
||||
browsertrix-behaviors@^0.5.0-beta.0:
|
||||
version "0.5.0-beta.0"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.0-beta.0.tgz#d1a7c35cda31d740a374df1e833f36bd1890768d"
|
||||
integrity sha512-RQMQlbV4OBAzYyhTI7imoem8p4MTj2XSDzlIZvA5sC5U89OMnJ0VM5KBAJzET3PUJkQlUQEOTiXtnsnodHXTUQ==
|
||||
browsertrix-behaviors@^0.5.1:
|
||||
version "0.5.1"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.1.tgz#c4756b349dcabd23e25f851cec804d92e94eb63b"
|
||||
integrity sha512-cNSSpQyQT73Y5NcBn2PFDkZM2ptxHVVcqxstryvtzZNOW9gGqzJlLPo8tmCBY00JHrMyn5rm8qImbFglcG/DKg==
|
||||
|
||||
bser@2.1.1:
|
||||
version "2.1.1"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue