diff --git a/crawler.js b/crawler.js index 6c3b078b..43d147b7 100644 --- a/crawler.js +++ b/crawler.js @@ -21,6 +21,7 @@ import { initRedis } from "./util/redis.js"; import { logger, errJSON } from "./util/logger.js"; import { runWorkers } from "./util/worker.js"; import { sleep, timedRun, secondsElapsed } from "./util/timing.js"; +import { collectAllFileSources } from "./util/file_reader.js"; import { Browser } from "./util/browser.js"; @@ -120,6 +121,7 @@ export class Crawler { this.done = false; + this.customBehaviors = ""; this.behaviorLastLine = null; this.browser = new Browser(); @@ -233,6 +235,10 @@ export class Crawler { } } + if (this.params.customBehaviors) { + this.customBehaviors = this.loadCustomBehaviors(this.params.customBehaviors); + } + let opts = {}; let redisStdio; @@ -347,6 +353,10 @@ export class Crawler { } break; + case "error": + logger.error(message, details, "behaviorScript"); + break; + case "debug": default: logger.debug(message, details, "behaviorScript"); @@ -399,10 +409,28 @@ export class Crawler { if (this.params.behaviorOpts) { await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata, page.url(), workerid)); - await this.browser.addInitScript(page, behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`); + await this.browser.addInitScript(page, behaviors); + + const initScript = ` +self.__bx_behaviors.init(${this.params.behaviorOpts}, false); +${this.customBehaviors} +self.__bx_behaviors.selectMainBehavior(); +`; + + await this.browser.addInitScript(page, initScript); } } + loadCustomBehaviors(filename) { + let str = ""; + + for (const source of collectAllFileSources(filename, ".js")) { + str += `self.__bx_behaviors.load(${source});\n`; + } + + return str; + } + async crawlPage(opts) { await this.writeStats(); diff --git a/tests/custom-behavior.test.js b/tests/custom-behavior.test.js new file mode 100644 index 00000000..df2a8c32 --- /dev/null +++ b/tests/custom-behavior.test.js @@ -0,0 
// End-to-end check of --customBehaviors: runs the crawler image against three
// seed URLs with the tests/custom-behaviors/ directory mounted, then inspects
// the captured stdout for the log entries each behavior is expected to emit.
test("test custom behaviors", async () => {
  const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page");

  const log = res.toString();

  // includes() is a true containment test; the previous `indexOf(...) > 0`
  // form would have reported a match at offset 0 as "not found".

  // custom behavior ran for example.com
  expect(log.includes("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.com/\",\"workerid\":0}}")).toBe(true);

  // but not for example.org
  expect(log.includes("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.org/\",\"workerid\":0}}")).toBe(false);

  // example.org is expected to log the autoscroll message instead
  expect(log.includes("{\"state\":{\"segments\":1},\"msg\":\"Skipping autoscroll, page seems to not be responsive to scrolling events\",\"page\":\"https://example.org/\",\"workerid\":0}}")).toBe(true);

  // another custom behavior ran for webrecorder.net
  expect(log.includes("{\"state\":{},\"msg\":\"test-stat-2\",\"page\":\"https://webrecorder.net/\",\"workerid\":0}}")).toBe(true);
});
// Maximum recursion depth; the starting path is depth 0.
const MAX_DEPTH = 2;

/**
 * Collect the contents of a file, or of the files inside a directory.
 *
 * Each returned entry is the file's text prefixed with a
 * `/* src: <absolute path> *\/` comment identifying where it came from.
 *
 * @param {string} fileOrDir - path to a single file or to a directory
 * @param {string|null} [ext=null] - if set, only files whose extension
 *   (as reported by path.extname, e.g. ".js") equals this value are read
 * @param {number} [depth=0] - current recursion depth; traversal stops with a
 *   warning once it reaches MAX_DEPTH
 * @returns {string[]} the collected sources; always an array, empty when
 *   nothing matched
 * @throws whatever fs.statSync / fs.readFileSync / fs.readdirSync throw,
 *   e.g. when the path does not exist
 */
export function collectAllFileSources(fileOrDir, ext = null, depth = 0) {
  const resolvedPath = path.resolve(fileOrDir);

  if (depth >= MAX_DEPTH) {
    console.warn(`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`);
    return [];
  }

  const stat = fs.statSync(resolvedPath);

  // isFile()/isDirectory() are methods: referencing them without calling
  // always yields a truthy function object, which previously sent
  // non-matching files into readdirSync (ENOTDIR) and, with no extension
  // filter, directories into readFileSync (EISDIR).
  if (stat.isFile()) {
    if (ext === null || path.extname(resolvedPath) === ext) {
      const contents = fs.readFileSync(resolvedPath);
      return [`/* src: ${resolvedPath} */\n\n${contents}`];
    }
  } else if (stat.isDirectory()) {
    return fs.readdirSync(resolvedPath).flatMap((entry) =>
      collectAllFileSources(path.join(fileOrDir, entry), ext, depth + 1));
  }

  // Only complain about the path the caller actually supplied; entries that
  // are skipped while walking a directory are ignored silently.
  if (depth === 0) {
    console.warn(`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`);
  }

  // Always hand back an array so recursive callers can flatten the result.
  return [];
}
version "0.5.0-beta.0" - resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.0-beta.0.tgz#d1a7c35cda31d740a374df1e833f36bd1890768d" - integrity sha512-RQMQlbV4OBAzYyhTI7imoem8p4MTj2XSDzlIZvA5sC5U89OMnJ0VM5KBAJzET3PUJkQlUQEOTiXtnsnodHXTUQ== +browsertrix-behaviors@^0.5.1: + version "0.5.1" + resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.1.tgz#c4756b349dcabd23e25f851cec804d92e94eb63b" + integrity sha512-cNSSpQyQT73Y5NcBn2PFDkZM2ptxHVVcqxstryvtzZNOW9gGqzJlLPo8tmCBY00JHrMyn5rm8qImbFglcG/DKg== bser@2.1.1: version "2.1.1"