mirror of
				https://github.com/webrecorder/browsertrix-crawler.git
				synced 2025-10-25 17:24:11 +00:00 
			
		
		
		
	
		
			
	
	
		
			175 lines
		
	
	
	
		
			5.1 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
		
		
			
		
	
	
			175 lines
		
	
	
	
		
			5.1 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
|   | import child_process from "child_process"; | ||
|  | import fs from "fs"; | ||
|  | import path from "path"; | ||
|  | import { WARCParser } from "warcio"; | ||
|  | 
 | ||
// PDF fixture fetched over HTTPS.
const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf";
// Plain-http variant of the same PDF, used by the "pdf with redirect" crawl.
// The replacement is anchored to the leading scheme so that an "https"
// substring elsewhere in the URL can never be rewritten by accident.
const PDF_HTTP = PDF.replace(/^https:/, "http:");

// XML feed fixture.
const XML = "https://webrecorder.net/feed.xml";
// "www." variant of the feed; the CDX test expects a 301 record for it.
const XML_REDIR = "https://www.webrecorder.net/feed.xml";
|  | 
 | ||
// Runs the crawler container against the HTTPS PDF URL, writing into the
// "crawl-pdf" collection under ./test-crawls. execSync throws if the
// command exits non-zero, which fails the test.
test("PDF: ensure pdf is crawled", () => {
  const crawlCommand = `docker run -v $PWD/test-crawls:/crawls  webrecorder/browsertrix-crawler crawl --url "${PDF}" --collection crawl-pdf`;

  child_process.execSync(crawlCommand);
});
|  | 
 | ||
// Reads a WARC from the crawl-pdf archive directory and verifies that the
// response record for the PDF URL was stored with HTTP status 200.
test("PDF: check that individual WARCs have PDF written as 200 response", async () => {
  const archiveDir = "test-crawls/collections/crawl-pdf/archive";

  // Only the first entry listed in the archive directory is inspected.
  const [firstWarc] = fs.readdirSync(archiveDir);
  const warcPath = path.join(archiveDir, firstWarc);
  const parser = new WARCParser(fs.createReadStream(warcPath));

  // Stays at -1 unless a response record for the PDF URL is seen; when
  // several records match, the last one read determines the value.
  let statusCode = -1;

  for await (const record of parser) {
    const isPdfResponse =
      record.warcType === "response" && record.warcTargetURI === PDF;

    if (isPdfResponse) {
      statusCode = record.httpHeaders.statusCode;
    }
  }

  expect(statusCode).toBe(200);
});
|  | 
 | ||
// Crawls the plain-http PDF URL into the same "crawl-pdf" collection, this
// time with --generateCDX so the CDX index test has an index file to read.
test("PDF: ensure pdf with redirect is crawled", () => {
  const crawlCommand = `docker run -v $PWD/test-crawls:/crawls  webrecorder/browsertrix-crawler crawl --url "${PDF_HTTP}" --collection crawl-pdf --generateCDX`;

  child_process.execSync(crawlCommand);
});
|  | 
 | ||
// Verifies that pages.jsonl for the crawl-pdf collection holds three lines
// and that lines 2 and 3 describe the HTTPS and plain-http PDF pages with
// status 200, mime "application/pdf" and loadState 2.
test("PDF: check that the pages.jsonl file entry contains status code and mime type", () => {
  const pagesFile = "test-crawls/collections/crawl-pdf/pages/pages.jsonl";

  expect(fs.existsSync(pagesFile)).toBe(true);

  const contents = fs.readFileSync(pagesFile, "utf8");
  const pages = contents.trim().split("\n");

  expect(pages.length).toBe(3);

  // The line at index 0 is not inspected; entries are checked in file order.
  const expectedEntries = [
    [1, PDF],
    [2, PDF_HTTP],
  ];

  for (const [lineIndex, expectedUrl] of expectedEntries) {
    const entry = JSON.parse(pages[lineIndex]);
    expect(entry.url).toBe(expectedUrl);
    expect(entry.status).toBe(200);
    expect(entry.mime).toBe("application/pdf");
    expect(entry.loadState).toBe(2);
  }
});
|  | 
 | ||
// Verifies the CDXJ index of the crawl-pdf collection: after sorting by URL
// it must contain the 301 for the plain-http URL, two 200 PDF records and
// the two urn:pageinfo records.
test("PDF: check that CDX contains one pdf 200, one 301 and one 200, two pageinfo entries", () => {
  const filedata = fs.readFileSync(
    "test-crawls/collections/crawl-pdf/indexes/index.cdxj",
    { encoding: "utf-8" },
  );

  // Orders records by URL using plain string comparison. It returns 0 for
  // equal URLs so the comparator is consistent: two records share the PDF
  // URL, and a comparator that never returns 0 leaves the resulting order
  // implementation-defined.
  const byUrl = (a, b) => {
    if (a.url < b.url) {
      return -1;
    }
    if (a.url > b.url) {
      return 1;
    }
    return 0;
  };

  const lines = filedata.trim().split("\n");
  // Each line has two leading space-separated fields followed by a JSON
  // object; drop the two fields and parse the remainder.
  const cdxj = lines
    .map((line) => JSON.parse(line.split(" ").slice(2).join(" ")))
    .sort(byUrl);

  expect(cdxj.length).toBe(5);

  expect(cdxj[0].url).toBe(PDF_HTTP);
  expect(cdxj[0].status).toBe("301");

  expect(cdxj[1].url).toBe(PDF);
  expect(cdxj[1].status).toBe("200");
  expect(cdxj[1].mime).toBe("application/pdf");

  expect(cdxj[2].url).toBe(PDF);
  expect(cdxj[2].status).toBe("200");
  expect(cdxj[2].mime).toBe("application/pdf");

  expect(cdxj[3].url).toBe("urn:pageinfo:" + PDF_HTTP);
  expect(cdxj[3].mime).toBe("application/json");

  expect(cdxj[4].url).toBe("urn:pageinfo:" + PDF);
  expect(cdxj[4].mime).toBe("application/json");
});
|  | 
 | ||
// Crawls both the XML feed URL and its "www." variant into the "crawl-xml"
// collection, generating a CDX index for the follow-up assertions.
test("XML: ensure with and without redirect is crawled", () => {
  const crawlCommand = `docker run -v $PWD/test-crawls:/crawls  webrecorder/browsertrix-crawler crawl --url "${XML}" --url "${XML_REDIR}" --collection crawl-xml --generateCDX`;

  child_process.execSync(crawlCommand);
});
|  | 
 | ||
// Verifies that pages.jsonl for the crawl-xml collection holds three lines
// and that lines 2 and 3 describe the XML feed and its "www." variant with
// status 200, mime "application/xml" and loadState 2.
test("XML: check pages.jsonl file entry contains status code and mime type", () => {
  const pagesFile = "test-crawls/collections/crawl-xml/pages/pages.jsonl";

  expect(fs.existsSync(pagesFile)).toBe(true);

  const contents = fs.readFileSync(pagesFile, "utf8");
  const pages = contents.trim().split("\n");

  expect(pages.length).toBe(3);

  // The line at index 0 is not inspected; entries are checked in file order.
  const expectedEntries = [
    [1, XML],
    [2, XML_REDIR],
  ];

  for (const [lineIndex, expectedUrl] of expectedEntries) {
    const entry = JSON.parse(pages[lineIndex]);
    expect(entry.url).toBe(expectedUrl);
    expect(entry.status).toBe(200);
    expect(entry.mime).toBe("application/xml");
    expect(entry.loadState).toBe(2);
  }
});
|  | 
 | ||
// Verifies the CDXJ index of the crawl-xml collection: after sorting by URL
// it must contain the favicon record, two 200 XML records, the 301 for the
// "www." URL and the two urn:pageinfo records.
test("XML: check that CDX contains one xml 200, one 301 and one 200, two pageinfo entries", () => {
  const filedata = fs.readFileSync(
    "test-crawls/collections/crawl-xml/indexes/index.cdxj",
    { encoding: "utf-8" },
  );

  // Orders records by URL using plain string comparison. It returns 0 for
  // equal URLs so the comparator is consistent: two records share the XML
  // URL, and a comparator that never returns 0 leaves the resulting order
  // implementation-defined.
  const byUrl = (a, b) => {
    if (a.url < b.url) {
      return -1;
    }
    if (a.url > b.url) {
      return 1;
    }
    return 0;
  };

  const lines = filedata.trim().split("\n");
  // Each line has two leading space-separated fields followed by a JSON
  // object; drop the two fields and parse the remainder.
  const cdxj = lines
    .map((line) => JSON.parse(line.split(" ").slice(2).join(" ")))
    .sort(byUrl);

  expect(cdxj.length).toBe(6);

  expect(cdxj[0].url).toBe("https://webrecorder.net/favicon.ico");

  expect(cdxj[1].url).toBe(XML);
  expect(cdxj[1].status).toBe("200");
  expect(cdxj[1].mime).toBe("application/xml");

  expect(cdxj[2].url).toBe(XML);
  expect(cdxj[2].status).toBe("200");
  expect(cdxj[2].mime).toBe("application/xml");

  expect(cdxj[3].url).toBe(XML_REDIR);
  expect(cdxj[3].status).toBe("301");

  expect(cdxj[4].url).toBe("urn:pageinfo:" + XML);
  expect(cdxj[4].mime).toBe("application/json");

  expect(cdxj[5].url).toBe("urn:pageinfo:" + XML_REDIR);
  expect(cdxj[5].mime).toBe("application/json");
});
|  | 
 | ||
|  | 
 |