mirror of
				https://github.com/webrecorder/browsertrix-crawler.git
				synced 2025-10-26 09:44:10 +00:00 
			
		
		
		
	 c796996664
			
		
	
	
		c796996664
		
			
		
	
	
	
	
		
			
			New Feature: - support 'flow behavior' from JSON specification - detect .json files via --customBehaviors - log behavior progress while running - logging tweaks (via browsertrix-behaviors 0.8.4) to limit logging for custom behaviors - differentiate logging for iframes, move more behavior messages to debug - move initCrawlState() to happen earlier to ensure Redis logging can happen in case of fatal errors - docs to be added in separate follow-up PR
		
			
				
	
	
		
			212 lines
		
	
	
	
		
			8.2 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
			
		
		
	
	
			212 lines
		
	
	
	
		
			8.2 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
| import child_process from "child_process";
 | |
| import Redis from "ioredis";
 | |
| 
 | |
| 
 | |
| async function sleep(time) {
 | |
|   await new Promise((resolve) => setTimeout(resolve, time));
 | |
| }
 | |
| 
 | |
| 
 | |
| test("test custom behaviors from local filepath", async () => {
 | |
|   const res = child_process.execSync(
 | |
|     "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
 | |
|   );
 | |
| 
 | |
|   const log = res.toString();
 | |
| 
 | |
|   // custom behavior ran for specs.webrecorder.net
 | |
|   expect(
 | |
|     log.indexOf(
 | |
|       '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://specs.webrecorder.net/","workerid":0}}',
 | |
|     ) > 0,
 | |
|   ).toBe(true);
 | |
| 
 | |
|   // but not for example.org
 | |
|   expect(
 | |
|     log.indexOf(
 | |
|       '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://example.org","workerid":0}}',
 | |
|     ) > 0,
 | |
|   ).toBe(false);
 | |
| 
 | |
|   // another custom behavior ran for old.webrecorder.net
 | |
|   expect(
 | |
|     log.indexOf(
 | |
|       '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat-2","details":{"state":{},"behavior":"TestBehavior2","page":"https://old.webrecorder.net/","workerid":0}}',
 | |
|     ) > 0,
 | |
|   ).toBe(true);
 | |
| });
 | |
| 
 | |
| test("test custom behavior from URL", async () => {
 | |
|   const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --scopeType page");
 | |
| 
 | |
|   const log = res.toString();
 | |
| 
 | |
|   expect(log.indexOf("Custom behavior file downloaded") > 0).toBe(true);
 | |
| 
 | |
|   expect(
 | |
|     log.indexOf(
 | |
|       '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat-2","details":{"state":{},"behavior":"TestBehavior2","page":"https://old.webrecorder.net/","workerid":0}}',
 | |
|     ) > 0,
 | |
|   ).toBe(true);
 | |
| });
 | |
| 
 | |
| test("test mixed custom behavior sources", async () => {
 | |
|   const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page");
 | |
| 
 | |
|   const log = res.toString();
 | |
| 
 | |
|   // test custom behavior from url ran
 | |
|   expect(log.indexOf("Custom behavior file downloaded") > 0).toBe(true);
 | |
| 
 | |
|   expect(
 | |
|     log.indexOf(
 | |
|       '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://specs.webrecorder.net/","workerid":0}}',
 | |
|     ) > 0,
 | |
|   ).toBe(true);
 | |
| 
 | |
|   // test custom behavior from local file ran
 | |
|   expect(
 | |
|     log.indexOf(
 | |
|       '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat-2","details":{"state":{},"behavior":"TestBehavior2","page":"https://old.webrecorder.net/","workerid":0}}',
 | |
|     ) > 0,
 | |
|   ).toBe(true);
 | |
| });
 | |
| 
 | |
| test("test custom behaviors from git repo", async () => {
 | |
|   const res = child_process.execSync(
 | |
|     "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors \"git+https://github.com/webrecorder/browsertrix-crawler.git?branch=main&path=tests/custom-behaviors\" --scopeType page",
 | |
|   );
 | |
| 
 | |
|   const log = res.toString();
 | |
| 
 | |
|   // custom behavior ran for specs.webrecorder.net
 | |
|   expect(
 | |
|     log.indexOf(
 | |
|       '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://specs.webrecorder.net/","workerid":0}}',
 | |
|     ) > 0,
 | |
|   ).toBe(true);
 | |
| 
 | |
|   // but not for example.org
 | |
|   expect(
 | |
|     log.indexOf(
 | |
|       '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://example.org/","workerid":0}}',
 | |
|     ) > 0,
 | |
|   ).toBe(false);
 | |
| 
 | |
|   // another custom behavior ran for old.webrecorder.net
 | |
|   expect(
 | |
|     log.indexOf(
 | |
|       '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat-2","details":{"state":{},"behavior":"TestBehavior2","page":"https://old.webrecorder.net/","workerid":0}}',
 | |
|     ) > 0,
 | |
|   ).toBe(true);
 | |
| });
 | |
| 
 | |
| test("test invalid behavior exit", async () => {
 | |
|   let status = 0;
 | |
| 
 | |
|   try {
 | |
|     child_process.execSync(
 | |
|       "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/invalid-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/invalid-export.js --scopeType page",
 | |
|     );
 | |
|   } catch (e) {
 | |
|     status = e.status;
 | |
|   }
 | |
| 
 | |
|   // logger fatal exit code
 | |
|   expect(status).toBe(17);
 | |
| });
 | |
| 
 | |
| test("test crawl exits if behavior not fetched from url", async () => {
 | |
|   let status = 0;
 | |
| 
 | |
|   try {
 | |
|     child_process.execSync(
 | |
|       "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com --customBehaviors https://webrecorder.net/doesntexist/custombehavior.js --scopeType page",
 | |
|     );
 | |
|   } catch (e) {
 | |
|     status = e.status;
 | |
|   }
 | |
| 
 | |
|   // logger fatal exit code
 | |
|   expect(status).toBe(17);
 | |
| });
 | |
| 
 | |
| test("test crawl exits if behavior not fetched from git repo", async () => {
 | |
|   let status = 0;
 | |
| 
 | |
|   try {
 | |
|     child_process.execSync(
 | |
|       "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com --customBehaviors git+https://github.com/webrecorder/doesntexist --scopeType page",
 | |
|     );
 | |
|   } catch (e) {
 | |
|     status = e.status;
 | |
|   }
 | |
| 
 | |
|   // logger fatal exit code
 | |
|   expect(status).toBe(17);
 | |
| });
 | |
| 
 | |
| test("test crawl exits if not custom behaviors collected from local path", async () => {
 | |
|   let status = 0;
 | |
| 
 | |
|   try {
 | |
|     child_process.execSync(
 | |
|       "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com --customBehaviors /custom-behaviors/doesntexist --scopeType page",
 | |
|     );
 | |
|   } catch (e) {
 | |
|     status = e.status;
 | |
|   }
 | |
| 
 | |
|   // logger fatal exit code
 | |
|   expect(status).toBe(17);
 | |
| });
 | |
| 
 | |
| test("test pushing behavior logs to redis", async () => {
 | |
|   child_process.execSync("docker network create crawl");
 | |
| 
 | |
|   const redisId = child_process.execSync("docker run --rm --network=crawl -p 36399:6379 --name redis -d redis");
 | |
| 
 | |
|   const child = child_process.exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ -e CRAWL_ID=behavior-logs-redis-test --network=crawl --rm webrecorder/browsertrix-crawler crawl --debugAccessRedis --redisStoreUrl redis://redis:6379 --url https://specs.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page --logBehaviorsToRedis");
 | |
| 
 | |
|   let resolve = null;
 | |
|   const crawlFinished = new Promise(r => resolve = r);
 | |
| 
 | |
|   child.on("exit", function () {
 | |
|     resolve();
 | |
|   });
 | |
| 
 | |
|   await crawlFinished;
 | |
| 
 | |
|   const redis = new Redis("redis://127.0.0.1:36399/0", { lazyConnect: true, retryStrategy: () => null });
 | |
| 
 | |
|   await sleep(3000);
 | |
| 
 | |
|   await redis.connect({ maxRetriesPerRequest: 50 });
 | |
| 
 | |
|   let customLogLineCount = 0;
 | |
| 
 | |
|   while (true) {
 | |
|     const res = await redis.lpop("behavior-logs-redis-test:b");
 | |
|     if (!res) {
 | |
|       break;
 | |
|     }
 | |
|     const json = JSON.parse(res);
 | |
|     expect(json).toHaveProperty("timestamp");
 | |
|     expect(json.logLevel).toBe("info");
 | |
|     expect(["behavior", "behaviorScript", "behaviorScriptCustom"]).toContain(json.context)
 | |
| 
 | |
|     if (json.context === "behaviorScriptCustom") {
 | |
|       expect(["TestBehavior", "TestBehavior2"]).toContain(json.details.behavior);
 | |
|       expect(["https://specs.webrecorder.net/", "https://old.webrecorder.net/"]).toContain(json.details.page);
 | |
|       customLogLineCount++;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   expect(customLogLineCount).toEqual(4);
 | |
| 
 | |
|   child_process.execSync(`docker kill ${redisId}`);
 | |
| 
 | |
|   await sleep(3000);
 | |
| 
 | |
|   child_process.execSync("docker network rm crawl");
 | |
| });
 |