import { exec, execSync } from "child_process";
import fs from "fs";
import { Redis } from "ioredis";
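
// Promise-based sleep helper, used to wait on containers and crawler state.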
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
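
// Shared across tests: the redis container id and the two crawler processes.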
let redisId;
let crawler1, crawler2;
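
// Setup: clear out collections from previous runs, create a shared docker
// network, start a redis container on it, then launch two crawler instances.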
beforeAll(() => {
  fs.rmSync("./test-crawls/collections/shared-crawler-1", { recursive: true, force: true });
  fs.rmSync("./test-crawls/collections/shared-crawler-2", { recursive: true, force: true });

  execSync("docker network create crawl");

  redisId = execSync("docker run --rm --network=crawl -p 37379:6379 --name redis -d redis");

  crawler1 = runCrawl("crawler-1");
  crawler2 = runCrawl("crawler-2");
});
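
// Teardown: stop redis, wait for both crawlers to exit, remove the network.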
afterAll(async () => {
  execSync(`docker kill ${redisId}`);

  await sleep(3000);

  await Promise.allSettled([crawler1, crawler2]);

  execSync("docker network rm crawl");
});
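
// Launch a crawler container on the shared network, pointing it at the
// external redis instance as the shared crawl-state store (--redisStoreUrl).
// Resolves with the container's exit code.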
function runCrawl(name) {
  const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=crawl --hostname=${name} webrecorder/browsertrix-crawler crawl --url https://www.webrecorder.net/ --limit 4 --exclude community --collection shared-${name} --crawlId testcrawl --redisStoreUrl redis://redis:6379`);

  return new Promise((resolve) => {
    crawler.on("exit", (code) => {
      resolve(code);
    });
  });
}
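
// Both crawlers should report a "running" status for the shared crawl id,
// polled via the externally exposed redis port, with a few retries while
// the containers start up.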
test("run crawlers with external redis", async () => {
  // maxRetriesPerRequest is a constructor option; ioredis' connect() does
  // not accept an options object, so it is set here.
  const redis = new Redis("redis://127.0.0.1:37379/0", {
    lazyConnect: true,
    retryStrategy: () => null,
    maxRetriesPerRequest: 50,
  });

  await sleep(3000);

  await redis.connect();

  let count = 0;

  while (true) {
    try {
      const values = await redis.hgetall("testcrawl:status");
      expect(values["crawler-1"]).toBe("running");
      expect(values["crawler-2"]).toBe("running");
      break;
    } catch (e) {
      if (count++ < 5) {
        await sleep(1000);
        continue;
      }

      throw e;
    }
  }
});
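
// Both crawler processes should exit cleanly once the shared crawl finishes.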
test("finish crawls successfully", async () => {
  const res = await Promise.allSettled([crawler1, crawler2]);
  expect(res[0].value).toBe(0);
  expect(res[1].value).toBe(0);
}, 180000);
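
// pages.jsonl holds seed pages: across both crawlers there should be exactly
// one seed page entry, plus a header line in each file.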
test("ensure correct number of pages", () => {
  expect(
    fs.existsSync("test-crawls/collections/shared-crawler-1/pages/pages.jsonl"),
  ).toBe(true);

  expect(
    fs.existsSync("test-crawls/collections/shared-crawler-2/pages/pages.jsonl"),
  ).toBe(true);

  const pages_1 = fs
    .readFileSync(
      "test-crawls/collections/shared-crawler-1/pages/pages.jsonl",
      "utf8",
    )
    .trim()
    .split("\n");

  const pages_2 = fs
    .readFileSync(
      "test-crawls/collections/shared-crawler-2/pages/pages.jsonl",
      "utf8",
    )
    .trim()
    .split("\n");

  // add 2 for heading in each file
  expect(pages_1.length + pages_2.length).toBe(1 + 2);
});
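
// extraPages.jsonl holds non-seed pages: the remaining 3 of the 4-page
// limit, split between the two crawlers, plus a header line in each file.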
test("ensure correct number of extraPages", () => {
  expect(
    fs.existsSync("test-crawls/collections/shared-crawler-1/pages/extraPages.jsonl"),
  ).toBe(true);

  expect(
    fs.existsSync("test-crawls/collections/shared-crawler-2/pages/extraPages.jsonl"),
  ).toBe(true);

  const pages_1 = fs
    .readFileSync(
      "test-crawls/collections/shared-crawler-1/pages/extraPages.jsonl",
      "utf8",
    )
    .trim()
    .split("\n");

  const pages_2 = fs
    .readFileSync(
      "test-crawls/collections/shared-crawler-2/pages/extraPages.jsonl",
      "utf8",
    )
    .trim()
    .split("\n");

  // add 2 for heading in each file
  expect(pages_1.length + pages_2.length).toBe(3 + 2);
});