| 
									
										
										
										
											2022-10-24 15:30:10 +02:00
										 |  |  | import fs from "fs"; | 
					
						
							|  |  |  | import zlib from "zlib"; | 
					
						
							| 
									
										
										
										
											2024-05-22 15:47:05 -07:00
										 |  |  | import path from "path"; | 
					
						
							| 
									
										
										
										
											2022-10-24 15:30:10 +02:00
										 |  |  | import child_process from "child_process"; | 
					
						
							| 
									
										
										
										
											2021-07-07 18:56:52 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-22 15:47:05 -07:00
										 |  |  | test("run crawl", async() => { | 
					
						
							|  |  |  |   let success = false; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-09 19:11:11 -05:00
										 |  |  |   try { | 
					
						
							| 
									
										
										
										
											2021-07-07 18:56:52 -04:00
										 |  |  |     const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8"); | 
					
						
							| 
									
										
										
										
											2023-11-09 19:11:11 -05:00
										 |  |  |     const proc = child_process.execSync( | 
					
						
							|  |  |  |       "docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC", | 
					
						
							|  |  |  |       { input: configYaml, stdin: "inherit", encoding: "utf8" }, | 
					
						
							|  |  |  |     ); | 
					
						
							| 
									
										
										
										
											2021-07-07 18:56:52 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-26 13:05:13 -07:00
										 |  |  |     //console.log(proc);
 | 
					
						
							| 
									
										
										
										
											2024-05-22 15:47:05 -07:00
										 |  |  |     success = true; | 
					
						
							| 
									
										
										
										
											2023-11-09 19:11:11 -05:00
										 |  |  |   } catch (error) { | 
					
						
							| 
									
										
										
										
											2021-07-07 18:56:52 -04:00
										 |  |  |     console.log(error); | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-22 15:47:05 -07:00
										 |  |  |   expect(success).toBe(true); | 
					
						
							|  |  |  | }); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | test("check that the warcinfo for individual WARC is as expected", async () => { | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   const warcs = fs.readdirSync("test-crawls/collections/warcinfo/archive/"); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   let filename = ""; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   for (const name of warcs) { | 
					
						
							|  |  |  |     if (name.startsWith("rec-")) { | 
					
						
							|  |  |  |       filename = path.join("test-crawls/collections/warcinfo/archive/", name); | 
					
						
							|  |  |  |       break; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   const warcData = fs.readFileSync(filename); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   const data = zlib.gunzipSync(warcData); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   const string = data.toString("utf8"); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   expect(string.indexOf("operator: test")).toBeGreaterThan(-1); | 
					
						
							|  |  |  |   expect(string.indexOf("host: hostname")).toBeGreaterThan(-1); | 
					
						
							|  |  |  |   expect( | 
					
						
							|  |  |  |     string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/), | 
					
						
							|  |  |  |   ).not.toEqual(null); | 
					
						
							|  |  |  |   expect(string.indexOf("format: WARC File Format 1.1")).toBeGreaterThan(-1); | 
					
						
							|  |  |  | }); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | test("check that the warcinfo for combined WARC file is as expected", async () => { | 
					
						
							| 
									
										
										
										
											2023-11-09 19:11:11 -05:00
										 |  |  |   const warcData = fs.readFileSync( | 
					
						
							|  |  |  |     "test-crawls/collections/warcinfo/warcinfo_0.warc.gz", | 
					
						
							|  |  |  |   ); | 
					
						
							| 
									
										
										
										
											2021-07-07 18:56:52 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |   const data = zlib.gunzipSync(warcData); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   const string = data.toString("utf8"); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   expect(string.indexOf("operator: test")).toBeGreaterThan(-1); | 
					
						
							|  |  |  |   expect(string.indexOf("host: hostname")).toBeGreaterThan(-1); | 
					
						
							| 
									
										
										
										
											2023-11-09 19:11:11 -05:00
										 |  |  |   expect( | 
					
						
							|  |  |  |     string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/), | 
					
						
							|  |  |  |   ).not.toEqual(null); | 
					
						
							| 
									
										
										
										
											2024-04-18 21:52:24 -07:00
										 |  |  |   expect(string.indexOf("format: WARC File Format 1.1")).toBeGreaterThan(-1); | 
					
						
							| 
									
										
										
										
											2021-07-07 18:56:52 -04:00
										 |  |  | }); |