| 
									
										
										
										
											2024-02-16 14:36:32 -08:00
										 |  |  | import child_process from "child_process"; | 
					
						
							|  |  |  | import fs from "fs"; | 
					
						
							|  |  |  | import path from "path"; | 
					
						
							|  |  |  | import { WARCParser } from "warcio"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | test("run warc and ensure pageinfo records contain the correct resources", async () => { | 
					
						
							|  |  |  |   child_process.execSync( | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |     "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --url https://old.webrecorder.net/about --url https://invalid.invalid/ --scopeType page --collection page-info-test --combineWARC", | 
					
						
							| 
									
										
										
										
											2024-02-16 14:36:32 -08:00
										 |  |  |   ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   const filename = path.join( | 
					
						
							|  |  |  |     "test-crawls", | 
					
						
							|  |  |  |     "collections", | 
					
						
							|  |  |  |     "page-info-test", | 
					
						
							|  |  |  |     "page-info-test_0.warc.gz", | 
					
						
							|  |  |  |   ); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   const nodeStream = fs.createReadStream(filename); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   const parser = new WARCParser(nodeStream); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   let foundIndex = false; | 
					
						
							|  |  |  |   let foundAbout = false; | 
					
						
							| 
									
										
										
										
											2024-03-07 08:35:53 -08:00
										 |  |  |   let foundInvalid = false; | 
					
						
							| 
									
										
										
										
											2024-02-16 14:36:32 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  |   for await (const record of parser) { | 
					
						
							|  |  |  |     if ( | 
					
						
							|  |  |  |       !foundIndex && | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |       record.warcTargetURI === "urn:pageinfo:https://old.webrecorder.net/" | 
					
						
							| 
									
										
										
										
											2024-02-16 14:36:32 -08:00
										 |  |  |     ) { | 
					
						
							|  |  |  |       foundIndex = true; | 
					
						
							|  |  |  |       const text = await record.contentText(); | 
					
						
							|  |  |  |       validateResourcesIndex(JSON.parse(text)); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if ( | 
					
						
							|  |  |  |       !foundAbout && | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |       record.warcTargetURI === "urn:pageinfo:https://old.webrecorder.net/about" | 
					
						
							| 
									
										
										
										
											2024-02-16 14:36:32 -08:00
										 |  |  |     ) { | 
					
						
							|  |  |  |       foundAbout = true; | 
					
						
							|  |  |  |       const text = await record.contentText(); | 
					
						
							|  |  |  |       validateResourcesAbout(JSON.parse(text)); | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-03-07 08:35:53 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if ( | 
					
						
							|  |  |  |       !foundInvalid && | 
					
						
							|  |  |  |       record.warcTargetURI === "urn:pageinfo:https://invalid.invalid/" | 
					
						
							|  |  |  |     ) { | 
					
						
							|  |  |  |       foundInvalid = true; | 
					
						
							|  |  |  |       const text = await record.contentText(); | 
					
						
							|  |  |  |       validateResourcesInvalid(JSON.parse(text)); | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-02-16 14:36:32 -08:00
										 |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   expect(foundIndex).toBe(true); | 
					
						
							|  |  |  |   expect(foundAbout).toBe(true); | 
					
						
							| 
									
										
										
										
											2024-03-07 08:35:53 -08:00
										 |  |  |   expect(foundInvalid).toBe(true); | 
					
						
							| 
									
										
										
										
											2024-02-16 14:36:32 -08:00
										 |  |  | }); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | function validateResourcesIndex(json) { | 
					
						
							|  |  |  |   expect(json).toHaveProperty("pageid"); | 
					
						
							|  |  |  |   expect(json).toHaveProperty("url"); | 
					
						
							|  |  |  |   expect(json).toHaveProperty("ts"); | 
					
						
							|  |  |  |   expect(json).toHaveProperty("urls"); | 
					
						
							| 
									
										
										
										
											2024-02-21 16:02:25 -08:00
										 |  |  |   expect(json.counts).toEqual({ jsErrors: 0 }); | 
					
						
							| 
									
										
										
										
											2024-02-16 14:36:32 -08:00
										 |  |  |   expect(json.urls).toEqual({ | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |     "https://old.webrecorder.net/": { | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |       status: 200, | 
					
						
							|  |  |  |       mime: "text/html", | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       type: "document", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     }, | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |     "https://old.webrecorder.net/assets/tools/logo-pywb.png": { | 
					
						
							| 
									
										
										
										
											2024-05-22 15:45:48 -07:00
										 |  |  |       mime: "image/png", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |       status: 200, | 
					
						
							| 
									
										
										
										
											2024-05-22 15:45:48 -07:00
										 |  |  |       type: "image", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     }, | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |     "https://old.webrecorder.net/assets/brand/archivewebpage-icon-color.svg": { | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |       mime: "image/svg+xml", | 
					
						
							| 
									
										
										
										
											2024-05-22 15:45:48 -07:00
										 |  |  |       status: 200, | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       type: "image", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     }, | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |     "https://old.webrecorder.net/assets/brand/browsertrix-icon-color.svg": { | 
					
						
							| 
									
										
										
										
											2024-05-22 15:45:48 -07:00
										 |  |  |       mime: "image/svg+xml", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |       status: 200, | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       type: "image", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     }, | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |     "https://old.webrecorder.net/assets/brand/browsertrixcrawler-icon-color.svg": { | 
					
						
							| 
									
										
										
										
											2024-05-22 15:45:48 -07:00
										 |  |  |       mime: "image/svg+xml", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |       status: 200, | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       type: "image", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     }, | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |     "https://old.webrecorder.net/assets/brand/replaywebpage-icon-color.svg": { | 
					
						
							| 
									
										
										
										
											2024-05-22 15:45:48 -07:00
										 |  |  |       mime: "image/svg+xml", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |       status: 200, | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       type: "image", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     }, | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |     "https://old.webrecorder.net/assets/fontawesome/all.css": { | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |       status: 200, | 
					
						
							| 
									
										
										
										
											2024-05-22 15:45:48 -07:00
										 |  |  |       mime: "text/css", | 
					
						
							|  |  |  |       type: "stylesheet", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     }, | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |     "https://old.webrecorder.net/assets/wr-logo.svg": { | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |       status: 200, | 
					
						
							| 
									
										
										
										
											2024-05-22 15:45:48 -07:00
										 |  |  |       mime: "image/svg+xml", | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       type: "image", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     }, | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |     "https://old.webrecorder.net/assets/main.css": { | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |       status: 200, | 
					
						
							|  |  |  |       mime: "text/css", | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       type: "stylesheet", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     }, | 
					
						
							|  |  |  |     "https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@700;900&display=swap": | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       { status: 200, mime: "text/css", type: "stylesheet" }, | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     "https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap": | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       { status: 200, mime: "text/css", type: "stylesheet" }, | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     "https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2": | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       { status: 200, mime: "font/woff2", type: "font" }, | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       { status: 200, mime: "font/woff2", type: "font" }, | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |     "https://old.webrecorder.net/assets/favicon.ico": { | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |       status: 200, | 
					
						
							|  |  |  |       mime: "image/vnd.microsoft.icon", | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       type: "other", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     }, | 
					
						
							| 
									
										
										
										
											2024-02-16 14:36:32 -08:00
										 |  |  |   }); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | function validateResourcesAbout(json) { | 
					
						
							|  |  |  |   expect(json).toHaveProperty("pageid"); | 
					
						
							|  |  |  |   expect(json).toHaveProperty("url"); | 
					
						
							|  |  |  |   expect(json).toHaveProperty("ts"); | 
					
						
							|  |  |  |   expect(json).toHaveProperty("urls"); | 
					
						
							| 
									
										
										
										
											2024-02-21 16:02:25 -08:00
										 |  |  |   expect(json.counts).toEqual({ jsErrors: 0 }); | 
					
						
							| 
									
										
										
										
											2024-02-16 14:36:32 -08:00
										 |  |  |   expect(json.urls).toEqual({ | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |     "https://old.webrecorder.net/about": { | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |       status: 200, | 
					
						
							|  |  |  |       mime: "text/html", | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       type: "document", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     }, | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |     "https://old.webrecorder.net/assets/main.css": { | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |       status: 200, | 
					
						
							|  |  |  |       mime: "text/css", | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       type: "stylesheet", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     }, | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |     "https://old.webrecorder.net/assets/fontawesome/all.css": { | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |       status: 200, | 
					
						
							|  |  |  |       mime: "text/css", | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       type: "stylesheet", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     }, | 
					
						
							|  |  |  |     "https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap": | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       { status: 200, mime: "text/css", type: "stylesheet" }, | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     "https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@700;900&display=swap": | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       { status: 200, mime: "text/css", type: "stylesheet" }, | 
					
						
							| 
									
										
										
										
											2024-10-31 10:24:58 -07:00
										 |  |  |     "https://old.webrecorder.net/assets/wr-logo.svg": { | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |       status: 200, | 
					
						
							|  |  |  |       mime: "image/svg+xml", | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       type: "image", | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     }, | 
					
						
							|  |  |  |     "https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2": | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       { status: 200, mime: "font/woff2", type: "font" }, | 
					
						
							| 
									
										
										
										
											2024-02-19 19:11:48 -08:00
										 |  |  |     "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": | 
					
						
							| 
									
										
										
										
											2024-03-04 18:10:45 -08:00
										 |  |  |       { status: 200, mime: "font/woff2", type: "font" }, | 
					
						
							| 
									
										
										
										
											2024-03-07 08:35:53 -08:00
										 |  |  |   }); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | function validateResourcesInvalid(json) { | 
					
						
							|  |  |  |   expect(json).toHaveProperty("pageid"); | 
					
						
							|  |  |  |   expect(json).toHaveProperty("url"); | 
					
						
							|  |  |  |   expect(json).toHaveProperty("urls"); | 
					
						
							|  |  |  |   expect(json.counts).toEqual({ jsErrors: 0 }); | 
					
						
							|  |  |  |   expect(json.urls).toEqual({ | 
					
						
							|  |  |  |     "https://invalid.invalid/": { | 
					
						
							|  |  |  |       status: 0, | 
					
						
							|  |  |  |       type: "document", | 
					
						
							|  |  |  |       error: "net::ERR_NAME_NOT_RESOLVED", | 
					
						
							|  |  |  |     }, | 
					
						
							| 
									
										
										
										
											2024-02-16 14:36:32 -08:00
										 |  |  |   }); | 
					
						
							|  |  |  | } |