mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Add option to push behavior + behavior script logs to Redis (#805)
Fixes #804 - Site-specific behaviors use behaviorScriptCustom log context (via browsertrix-behaviors 0.8.3) - Add behavior logs to redis if --logBehaviorsToRedis is set, including non-debug behaviors / behaviorsScript context and all behaviorScriptCustom logs - Noisy logs from built-in behaviors like autoscroll are now logged to debug in https://github.com/webrecorder/browsertrix-behaviors/pull/92 and so won't be pushed to Redis for newer versions of the crawler. - Updates browsertrix-behaviors to 0.8.3 and makes some changes to log format in tests accordingly. --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
parent
6898bcf7ae
commit
f83d0e8f02
7 changed files with 143 additions and 33 deletions
|
@ -18,7 +18,7 @@
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@novnc/novnc": "1.4.0",
|
"@novnc/novnc": "1.4.0",
|
||||||
"@webrecorder/wabac": "^2.22.9",
|
"@webrecorder/wabac": "^2.22.9",
|
||||||
"browsertrix-behaviors": "^0.8.1",
|
"browsertrix-behaviors": "^0.8.3",
|
||||||
"client-zip": "^2.4.5",
|
"client-zip": "^2.4.5",
|
||||||
"css-selector-parser": "^3.0.5",
|
"css-selector-parser": "^3.0.5",
|
||||||
"fetch-socks": "^1.3.0",
|
"fetch-socks": "^1.3.0",
|
||||||
|
|
|
@ -32,7 +32,7 @@ import {
|
||||||
import { ScreenCaster, WSTransport } from "./util/screencaster.js";
|
import { ScreenCaster, WSTransport } from "./util/screencaster.js";
|
||||||
import { Screenshots } from "./util/screenshots.js";
|
import { Screenshots } from "./util/screenshots.js";
|
||||||
import { initRedis } from "./util/redis.js";
|
import { initRedis } from "./util/redis.js";
|
||||||
import { logger, formatErr, LogDetails } from "./util/logger.js";
|
import { logger, formatErr, LogDetails, LogContext } from "./util/logger.js";
|
||||||
import { WorkerState, closeWorkers, runWorkers } from "./util/worker.js";
|
import { WorkerState, closeWorkers, runWorkers } from "./util/worker.js";
|
||||||
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
|
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
|
||||||
import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";
|
import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";
|
||||||
|
@ -401,6 +401,13 @@ export class Crawler {
|
||||||
|
|
||||||
if (this.params.logErrorsToRedis) {
|
if (this.params.logErrorsToRedis) {
|
||||||
logger.setLogErrorsToRedis(true);
|
logger.setLogErrorsToRedis(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.params.logBehaviorsToRedis) {
|
||||||
|
logger.setLogBehaviorsToRedis(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.params.logErrorsToRedis || this.params.logBehaviorsToRedis) {
|
||||||
logger.setCrawlState(this.crawlState);
|
logger.setCrawlState(this.crawlState);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -645,33 +652,58 @@ export class Crawler {
|
||||||
|
|
||||||
const logDetails = { page: pageUrl, workerid };
|
const logDetails = { page: pageUrl, workerid };
|
||||||
|
|
||||||
|
let context: LogContext = "behaviorScript";
|
||||||
|
|
||||||
if (typeof data === "string") {
|
if (typeof data === "string") {
|
||||||
message = data;
|
message = data;
|
||||||
details = logDetails;
|
details = logDetails;
|
||||||
} else {
|
} else {
|
||||||
message = type === "info" ? "Behavior log" : "Behavior debug";
|
switch (type) {
|
||||||
|
case "error":
|
||||||
|
message = "Behavior error";
|
||||||
|
break;
|
||||||
|
case "debug":
|
||||||
|
message = "Behavior debug";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
message = "Behavior log";
|
||||||
|
}
|
||||||
details =
|
details =
|
||||||
typeof data === "object"
|
typeof data === "object"
|
||||||
? { ...(data as object), ...logDetails }
|
? { ...(data as object), ...logDetails }
|
||||||
: logDetails;
|
: logDetails;
|
||||||
|
|
||||||
|
if (typeof data === "object") {
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
const objData = data as any;
|
||||||
|
if (objData.siteSpecific) {
|
||||||
|
context = "behaviorScriptCustom";
|
||||||
|
delete objData.siteSpecific;
|
||||||
|
}
|
||||||
|
message = objData.msg || message;
|
||||||
|
delete objData.msg;
|
||||||
|
details = { ...objData, ...logDetails };
|
||||||
|
} else {
|
||||||
|
details = logDetails;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case "info":
|
case "info":
|
||||||
behaviorLine = JSON.stringify(data);
|
behaviorLine = JSON.stringify(data);
|
||||||
if (behaviorLine !== this.behaviorLastLine) {
|
if (behaviorLine !== this.behaviorLastLine) {
|
||||||
logger.info(message, details, "behaviorScript");
|
logger.info(message, details, context);
|
||||||
this.behaviorLastLine = behaviorLine;
|
this.behaviorLastLine = behaviorLine;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case "error":
|
case "error":
|
||||||
logger.error(message, details, "behaviorScript");
|
logger.error(message, details, context);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case "debug":
|
case "debug":
|
||||||
default:
|
default:
|
||||||
logger.debug(message, details, "behaviorScript");
|
logger.debug(message, details, context);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -549,6 +549,12 @@ class ArgParser {
|
||||||
default: false,
|
default: false,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
logBehaviorsToRedis: {
|
||||||
|
describe: "If set, write behavior script messages to redis",
|
||||||
|
type: "boolean",
|
||||||
|
default: false,
|
||||||
|
},
|
||||||
|
|
||||||
writePagesToRedis: {
|
writePagesToRedis: {
|
||||||
describe: "If set, write page objects to redis",
|
describe: "If set, write page objects to redis",
|
||||||
type: "boolean",
|
type: "boolean",
|
||||||
|
|
|
@ -45,6 +45,7 @@ export const LOG_CONTEXT_TYPES = [
|
||||||
"blocking",
|
"blocking",
|
||||||
"behavior",
|
"behavior",
|
||||||
"behaviorScript",
|
"behaviorScript",
|
||||||
|
"behaviorScriptCustom",
|
||||||
"jsError",
|
"jsError",
|
||||||
"fetch",
|
"fetch",
|
||||||
"pageStatus",
|
"pageStatus",
|
||||||
|
@ -70,6 +71,7 @@ class Logger {
|
||||||
logStream: Writable | null = null;
|
logStream: Writable | null = null;
|
||||||
debugLogging = false;
|
debugLogging = false;
|
||||||
logErrorsToRedis = false;
|
logErrorsToRedis = false;
|
||||||
|
logBehaviorsToRedis = false;
|
||||||
logLevels: string[] = [];
|
logLevels: string[] = [];
|
||||||
contexts: LogContext[] = [];
|
contexts: LogContext[] = [];
|
||||||
excludeContexts: LogContext[] = [];
|
excludeContexts: LogContext[] = [];
|
||||||
|
@ -92,6 +94,10 @@ class Logger {
|
||||||
this.logErrorsToRedis = logErrorsToRedis;
|
this.logErrorsToRedis = logErrorsToRedis;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setLogBehaviorsToRedis(logBehaviorsToRedis: boolean) {
|
||||||
|
this.logBehaviorsToRedis = logBehaviorsToRedis;
|
||||||
|
}
|
||||||
|
|
||||||
setLogLevel(logLevels: string[]) {
|
setLogLevel(logLevels: string[]) {
|
||||||
this.logLevels = logLevels;
|
this.logLevels = logLevels;
|
||||||
}
|
}
|
||||||
|
@ -152,14 +158,27 @@ class Logger {
|
||||||
//
|
//
|
||||||
}
|
}
|
||||||
|
|
||||||
const toLogToRedis = ["error", "fatal"];
|
const redisErrorLogLevels = ["error", "fatal"];
|
||||||
if (
|
if (
|
||||||
this.logErrorsToRedis &&
|
this.logErrorsToRedis &&
|
||||||
this.crawlState &&
|
this.crawlState &&
|
||||||
toLogToRedis.includes(logLevel)
|
redisErrorLogLevels.includes(logLevel)
|
||||||
) {
|
) {
|
||||||
this.crawlState.logError(string).catch(() => {});
|
this.crawlState.logError(string).catch(() => {});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const redisBehaviorLogLevels = ["info", "warn", "error"];
|
||||||
|
const behaviorContexts = ["behavior", "behaviorScript"];
|
||||||
|
if (
|
||||||
|
this.logBehaviorsToRedis &&
|
||||||
|
this.crawlState &&
|
||||||
|
((behaviorContexts.includes(context) &&
|
||||||
|
redisBehaviorLogLevels.includes(logLevel)) ||
|
||||||
|
//always include behaviorScriptCustom
|
||||||
|
context === "behaviorScriptCustom")
|
||||||
|
) {
|
||||||
|
this.crawlState.logBehavior(string).catch(() => {});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
info(message: string, data: unknown = {}, context: LogContext = "general") {
|
info(message: string, data: unknown = {}, context: LogContext = "general") {
|
||||||
|
|
|
@ -181,6 +181,7 @@ export class RedisCrawlState {
|
||||||
dkey: string;
|
dkey: string;
|
||||||
fkey: string;
|
fkey: string;
|
||||||
ekey: string;
|
ekey: string;
|
||||||
|
bkey: string;
|
||||||
pageskey: string;
|
pageskey: string;
|
||||||
esKey: string;
|
esKey: string;
|
||||||
esMap: string;
|
esMap: string;
|
||||||
|
@ -212,6 +213,8 @@ export class RedisCrawlState {
|
||||||
this.fkey = this.key + ":f";
|
this.fkey = this.key + ":f";
|
||||||
// crawler errors
|
// crawler errors
|
||||||
this.ekey = this.key + ":e";
|
this.ekey = this.key + ":e";
|
||||||
|
// crawler behavior script messages
|
||||||
|
this.bkey = this.key + ":b";
|
||||||
// pages
|
// pages
|
||||||
this.pageskey = this.key + ":pages";
|
this.pageskey = this.key + ":pages";
|
||||||
|
|
||||||
|
@ -931,6 +934,10 @@ return inx;
|
||||||
return await this.redis.lpush(this.ekey, error);
|
return await this.redis.lpush(this.ekey, error);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async logBehavior(behaviorLog: string) {
|
||||||
|
return await this.redis.lpush(this.bkey, behaviorLog);
|
||||||
|
}
|
||||||
|
|
||||||
async writeToPagesQueue(
|
async writeToPagesQueue(
|
||||||
data: Record<string, string | number | boolean | object>,
|
data: Record<string, string | number | boolean | object>,
|
||||||
) {
|
) {
|
||||||
|
|
|
@ -1,4 +1,11 @@
|
||||||
import child_process from "child_process";
|
import child_process from "child_process";
|
||||||
|
import Redis from "ioredis";
|
||||||
|
|
||||||
|
|
||||||
|
async function sleep(time) {
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, time));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
test("test custom behaviors from local filepath", async () => {
|
test("test custom behaviors from local filepath", async () => {
|
||||||
const res = child_process.execSync(
|
const res = child_process.execSync(
|
||||||
|
@ -10,27 +17,21 @@ test("test custom behaviors from local filepath", async () => {
|
||||||
// custom behavior ran for specs.webrecorder.net
|
// custom behavior ran for specs.webrecorder.net
|
||||||
expect(
|
expect(
|
||||||
log.indexOf(
|
log.indexOf(
|
||||||
'{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
|
'"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://specs.webrecorder.net/","workerid":0}}',
|
||||||
) > 0,
|
) > 0,
|
||||||
).toBe(true);
|
).toBe(true);
|
||||||
|
|
||||||
// but not for example.org
|
// but not for example.org
|
||||||
expect(
|
expect(
|
||||||
log.indexOf(
|
log.indexOf(
|
||||||
'{"state":{},"msg":"test-stat","page":"https://example.org/","workerid":0}}',
|
'"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://example.org","workerid":0}}',
|
||||||
) > 0,
|
) > 0,
|
||||||
).toBe(false);
|
).toBe(false);
|
||||||
|
|
||||||
expect(
|
|
||||||
log.indexOf(
|
|
||||||
'{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events","page":"https://example.org/","workerid":0}}',
|
|
||||||
) > 0,
|
|
||||||
).toBe(true);
|
|
||||||
|
|
||||||
// another custom behavior ran for old.webrecorder.net
|
// another custom behavior ran for old.webrecorder.net
|
||||||
expect(
|
expect(
|
||||||
log.indexOf(
|
log.indexOf(
|
||||||
'{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}',
|
'"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat-2","details":{"state":{},"behavior":"TestBehavior2","page":"https://old.webrecorder.net/","workerid":0}}',
|
||||||
) > 0,
|
) > 0,
|
||||||
).toBe(true);
|
).toBe(true);
|
||||||
});
|
});
|
||||||
|
@ -44,7 +45,7 @@ test("test custom behavior from URL", async () => {
|
||||||
|
|
||||||
expect(
|
expect(
|
||||||
log.indexOf(
|
log.indexOf(
|
||||||
'{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}',
|
'"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat-2","details":{"state":{},"behavior":"TestBehavior2","page":"https://old.webrecorder.net/","workerid":0}}',
|
||||||
) > 0,
|
) > 0,
|
||||||
).toBe(true);
|
).toBe(true);
|
||||||
});
|
});
|
||||||
|
@ -59,14 +60,14 @@ test("test mixed custom behavior sources", async () => {
|
||||||
|
|
||||||
expect(
|
expect(
|
||||||
log.indexOf(
|
log.indexOf(
|
||||||
'{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
|
'"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://specs.webrecorder.net/","workerid":0}}',
|
||||||
) > 0,
|
) > 0,
|
||||||
).toBe(true);
|
).toBe(true);
|
||||||
|
|
||||||
// test custom behavior from local file ran
|
// test custom behavior from local file ran
|
||||||
expect(
|
expect(
|
||||||
log.indexOf(
|
log.indexOf(
|
||||||
'{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}',
|
'"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat-2","details":{"state":{},"behavior":"TestBehavior2","page":"https://old.webrecorder.net/","workerid":0}}',
|
||||||
) > 0,
|
) > 0,
|
||||||
).toBe(true);
|
).toBe(true);
|
||||||
});
|
});
|
||||||
|
@ -81,27 +82,21 @@ test("test custom behaviors from git repo", async () => {
|
||||||
// custom behavior ran for specs.webrecorder.net
|
// custom behavior ran for specs.webrecorder.net
|
||||||
expect(
|
expect(
|
||||||
log.indexOf(
|
log.indexOf(
|
||||||
'{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
|
'"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://specs.webrecorder.net/","workerid":0}}',
|
||||||
) > 0,
|
) > 0,
|
||||||
).toBe(true);
|
).toBe(true);
|
||||||
|
|
||||||
// but not for example.org
|
// but not for example.org
|
||||||
expect(
|
expect(
|
||||||
log.indexOf(
|
log.indexOf(
|
||||||
'{"state":{},"msg":"test-stat","page":"https://example.org/","workerid":0}}',
|
'"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://example.org/","workerid":0}}',
|
||||||
) > 0,
|
) > 0,
|
||||||
).toBe(false);
|
).toBe(false);
|
||||||
|
|
||||||
expect(
|
|
||||||
log.indexOf(
|
|
||||||
'{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events","page":"https://example.org/","workerid":0}}',
|
|
||||||
) > 0,
|
|
||||||
).toBe(true);
|
|
||||||
|
|
||||||
// another custom behavior ran for old.webrecorder.net
|
// another custom behavior ran for old.webrecorder.net
|
||||||
expect(
|
expect(
|
||||||
log.indexOf(
|
log.indexOf(
|
||||||
'{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}',
|
'"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat-2","details":{"state":{},"behavior":"TestBehavior2","page":"https://old.webrecorder.net/","workerid":0}}',
|
||||||
) > 0,
|
) > 0,
|
||||||
).toBe(true);
|
).toBe(true);
|
||||||
});
|
});
|
||||||
|
@ -165,3 +160,54 @@ test("test crawl exits if not custom behaviors collected from local path", async
|
||||||
// logger fatal exit code
|
// logger fatal exit code
|
||||||
expect(status).toBe(17);
|
expect(status).toBe(17);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("test pushing behavior logs to redis", async () => {
|
||||||
|
child_process.execSync("docker network create crawl");
|
||||||
|
|
||||||
|
const redisId = child_process.execSync("docker run --rm --network=crawl -p 36399:6379 --name redis -d redis");
|
||||||
|
|
||||||
|
const child = child_process.exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ -e CRAWL_ID=behavior-logs-redis-test --network=crawl --rm webrecorder/browsertrix-crawler crawl --debugAccessRedis --redisStoreUrl redis://redis:6379 --url https://specs.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page --logBehaviorsToRedis");
|
||||||
|
|
||||||
|
let resolve = null;
|
||||||
|
const crawlFinished = new Promise(r => resolve = r);
|
||||||
|
|
||||||
|
child.on("exit", function () {
|
||||||
|
resolve();
|
||||||
|
});
|
||||||
|
|
||||||
|
await crawlFinished;
|
||||||
|
|
||||||
|
const redis = new Redis("redis://127.0.0.1:36399/0", { lazyConnect: true, retryStrategy: () => null });
|
||||||
|
|
||||||
|
await sleep(3000);
|
||||||
|
|
||||||
|
await redis.connect({ maxRetriesPerRequest: 50 });
|
||||||
|
|
||||||
|
let customLogLineCount = 0;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
const res = await redis.lpop("behavior-logs-redis-test:b");
|
||||||
|
if (!res) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
const json = JSON.parse(res);
|
||||||
|
expect(json).toHaveProperty("timestamp");
|
||||||
|
expect(json.logLevel).toBe("info");
|
||||||
|
expect(["behavior", "behaviorScript", "behaviorScriptCustom"]).toContain(json.context)
|
||||||
|
|
||||||
|
if (json.context === "behaviorScriptCustom") {
|
||||||
|
expect(["test-stat", "test-stat-2", "done!"]).toContain(json.message);
|
||||||
|
expect(["TestBehavior", "TestBehavior2"]).toContain(json.details.behavior);
|
||||||
|
expect(["https://specs.webrecorder.net/", "https://old.webrecorder.net/"]).toContain(json.details.page);
|
||||||
|
customLogLineCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
expect(customLogLineCount).toEqual(4);
|
||||||
|
|
||||||
|
child_process.execSync(`docker kill ${redisId}`);
|
||||||
|
|
||||||
|
await sleep(3000);
|
||||||
|
|
||||||
|
child_process.execSync("docker network rm crawl");
|
||||||
|
});
|
||||||
|
|
|
@ -1460,10 +1460,10 @@ browserslist@^4.24.0:
|
||||||
node-releases "^2.0.18"
|
node-releases "^2.0.18"
|
||||||
update-browserslist-db "^1.1.1"
|
update-browserslist-db "^1.1.1"
|
||||||
|
|
||||||
browsertrix-behaviors@^0.8.1:
|
browsertrix-behaviors@^0.8.3:
|
||||||
version "0.8.1"
|
version "0.8.3"
|
||||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.8.1.tgz#21d2726d32ca23f94efd2c3e9a0b8b98c42482bb"
|
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.8.3.tgz#3710f8d778c09ab4f46d5b7e1d4ebbfcbc1a8f02"
|
||||||
integrity sha512-bG5KA/9fF60slakycvbJZ4COo/nD9GKflNEhEk3yQQTy3C0ikRVOpueG9XfvAgq2r4Ji1NMY9pesJRpQqtHKXg==
|
integrity sha512-ie+fsa45K1kZ2PWvt8ISObgFto9AXzbDB92ts+vgvVJ8erRA9pUKUyOVnK/B/qdNbYO0EQJiSP24GRqNp702tQ==
|
||||||
dependencies:
|
dependencies:
|
||||||
query-selector-shadow-dom "^1.0.1"
|
query-selector-shadow-dom "^1.0.1"
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue