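// Tests for crawling non-HTML resources (a PDF and an XML feed) with
// browsertrix-crawler: each crawl runs via the Docker image, then the
// resulting WARCs, pages.jsonl entries, and CDXJ index are checked.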
import child_process from "child_process";
import fs from "fs";
import path from "path";
import { WARCParser } from "warcio";

const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf";
const PDF_HTTP = PDF.replace("https", "http");

const XML = "https://webrecorder.net/feed.xml";
const XML_REDIR = "https://www.webrecorder.net/feed.xml";

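// Crawl the PDF over https; $PWD/test-crawls on the host is mounted as
// /crawls in the container, so crawl output lands in test-crawls/collections.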
test("PDF: ensure pdf is crawled", () => {
|
|
child_process.execSync(
|
|
`docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url "${PDF}" --collection crawl-pdf`
|
|
);
|
|
});
|
|
|
|
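// Scan the WARC written by the crawl above with warcio's WARCParser and
// capture the HTTP status of the response record for the PDF URL.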
test("PDF: check that individual WARCs have PDF written as 200 response", async () => {
|
|
const archiveWarcLists = fs.readdirSync(
|
|
"test-crawls/collections/crawl-pdf/archive",
|
|
);
|
|
|
|
const warcName = path.join("test-crawls/collections/crawl-pdf/archive", archiveWarcLists[0]);
|
|
|
|
const nodeStream = fs.createReadStream(warcName);
|
|
|
|
const parser = new WARCParser(nodeStream);
|
|
|
|
let statusCode = -1;
|
|
|
|
for await (const record of parser) {
|
|
if (record.warcType !== "response") {
|
|
continue;
|
|
}
|
|
|
|
if (record.warcTargetURI === PDF) {
|
|
statusCode = record.httpHeaders.statusCode;
|
|
}
|
|
}
|
|
|
|
expect(statusCode).toBe(200);
|
|
});
|
|
|
|
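// Crawl the same PDF over http (expected to redirect to https, per the 301
// asserted below) into the same collection, and generate a CDXJ index.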
test("PDF: ensure pdf with redirect is crawled", () => {
|
|
child_process.execSync(
|
|
`docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url "${PDF_HTTP}" --collection crawl-pdf --generateCDX`
|
|
);
|
|
});
|
|
|
|
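// pages.jsonl should hold three lines: a header record followed by one entry
// per crawled page (the https PDF, then the http variant).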
test("PDF: check that the pages.jsonl file entry contains status code and mime type", () => {
|
|
expect(
|
|
fs.existsSync("test-crawls/collections/crawl-pdf/pages/pages.jsonl"),
|
|
).toBe(true);
|
|
|
|
|
|
const pages = fs
|
|
.readFileSync(
|
|
"test-crawls/collections/crawl-pdf/pages/pages.jsonl",
|
|
"utf8",
|
|
)
|
|
.trim()
|
|
.split("\n");
|
|
|
|
expect(pages.length).toBe(3);
|
|
|
|
const page = JSON.parse(pages[1]);
|
|
expect(page.url).toBe(PDF);
|
|
expect(page.status).toBe(200);
|
|
expect(page.mime).toBe("application/pdf");
|
|
expect(page.loadState).toBe(2);
|
|
|
|
const pageH = JSON.parse(pages[2]);
|
|
expect(pageH.url).toBe(PDF_HTTP);
|
|
expect(pageH.status).toBe(200);
|
|
expect(pageH.mime).toBe("application/pdf");
|
|
expect(pageH.loadState).toBe(2);
|
|
});
|
|
|
|
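// The CDXJ index, sorted by URL, should contain the http->https 301, the PDF
// captured as a 200 twice, and a urn:pageinfo: record for each of the two pages.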
test("PDF: check that CDX contains one pdf 200, one 301 and one 200, two pageinfo entries", () => {
|
|
const filedata = fs.readFileSync(
|
|
"test-crawls/collections/crawl-pdf/indexes/index.cdxj",
|
|
{ encoding: "utf-8" },
|
|
);
|
|
|
|
const lines = filedata.trim().split("\n");
|
|
const cdxj = lines.map(line => JSON.parse(line.split(" ").slice(2).join(" "))).sort((a, b) => a.url < b.url ? -1 : 1);
|
|
|
|
expect(cdxj.length).toBe(5);
|
|
|
|
expect(cdxj[0].url).toBe(PDF_HTTP);
|
|
expect(cdxj[0].status).toBe("301");
|
|
|
|
expect(cdxj[1].url).toBe(PDF);
|
|
expect(cdxj[1].status).toBe("200");
|
|
expect(cdxj[1].mime).toBe("application/pdf");
|
|
|
|
expect(cdxj[2].url).toBe(PDF);
|
|
expect(cdxj[2].status).toBe("200");
|
|
expect(cdxj[2].mime).toBe("application/pdf");
|
|
|
|
expect(cdxj[3].url).toBe("urn:pageinfo:" + PDF_HTTP);
|
|
expect(cdxj[3].mime).toBe("application/json");
|
|
|
|
expect(cdxj[4].url).toBe("urn:pageinfo:" + PDF);
|
|
expect(cdxj[4].mime).toBe("application/json");
|
|
});
|
|
|
|
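// Crawl the XML feed via its canonical URL and the www. variant (which is
// expected to redirect) in a single run, generating a CDXJ index.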
test("XML: ensure with and without redirect is crawled", () => {
|
|
child_process.execSync(
|
|
`docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url "${XML}" --url "${XML_REDIR}" --collection crawl-xml --generateCDX`
|
|
);
|
|
});
|
|
|
|
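// As with the PDF collection, expect a header line plus one pages.jsonl entry
// per crawled URL, each with status, mime type, and loadState recorded.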
test("XML: check pages.jsonl file entry contains status code and mime type", () => {
|
|
expect(
|
|
fs.existsSync("test-crawls/collections/crawl-xml/pages/pages.jsonl"),
|
|
).toBe(true);
|
|
|
|
|
|
const pages = fs
|
|
.readFileSync(
|
|
"test-crawls/collections/crawl-xml/pages/pages.jsonl",
|
|
"utf8",
|
|
)
|
|
.trim()
|
|
.split("\n");
|
|
|
|
expect(pages.length).toBe(3);
|
|
|
|
const page = JSON.parse(pages[1]);
|
|
expect(page.url).toBe(XML);
|
|
expect(page.status).toBe(200);
|
|
expect(page.mime).toBe("application/xml");
|
|
expect(page.loadState).toBe(2);
|
|
|
|
const pageH = JSON.parse(pages[2]);
|
|
expect(pageH.url).toBe(XML_REDIR);
|
|
expect(pageH.status).toBe(200);
|
|
expect(pageH.mime).toBe("application/xml");
|
|
expect(pageH.loadState).toBe(2);
|
|
});
|
|
|
|
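// The site favicon is also captured during the XML crawl, so the sorted CDXJ
// contains six entries here rather than five.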
test("XML: check that CDX contains one xml 200, one 301 and one 200, two pageinfo entries", () => {
|
|
const filedata = fs.readFileSync(
|
|
"test-crawls/collections/crawl-xml/indexes/index.cdxj",
|
|
{ encoding: "utf-8" },
|
|
);
|
|
|
|
const lines = filedata.trim().split("\n");
|
|
const cdxj = lines.map(line => JSON.parse(line.split(" ").slice(2).join(" "))).sort((a, b) => a.url < b.url ? -1 : 1);
|
|
|
|
expect(cdxj.length).toBe(6);
|
|
|
|
expect(cdxj[0].url).toBe("https://webrecorder.net/favicon.ico");
|
|
|
|
expect(cdxj[1].url).toBe(XML);
|
|
expect(cdxj[1].status).toBe("200");
|
|
expect(cdxj[1].mime).toBe("application/xml");
|
|
|
|
expect(cdxj[2].url).toBe(XML);
|
|
expect(cdxj[2].status).toBe("200");
|
|
expect(cdxj[2].mime).toBe("application/xml");
|
|
|
|
expect(cdxj[3].url).toBe(XML_REDIR);
|
|
expect(cdxj[3].status).toBe("301");
|
|
|
|
expect(cdxj[4].url).toBe("urn:pageinfo:" + XML);
|
|
expect(cdxj[4].mime).toBe("application/json");
|
|
|
|
expect(cdxj[5].url).toBe("urn:pageinfo:" + XML_REDIR);
|
|
expect(cdxj[5].mime).toBe("application/json");
|
|
});
|
|
|
|
|