browsertrix-crawler/tests/warcinfo.test.js
Ilya Kreymer 877d9f5b44
Use new browser-based archiving mechanism instead of pywb proxy (#424)
Major refactoring of Browsertrix Crawler to natively capture network traffic to WARC files
via the Chrome Debug Protocol (CDP). Allows for more flexibility and accuracy when dealing
with HTTP/2.x sites and avoids a MITM proxy. Addresses #343 

Changes include:
- Recorder class for capturing CDP network traffic for each page.
- Handling requests from service workers via matching active frames, skipping unrelated requests outside the page (from background pages, etc.).
- WARC writing support via TS-based warcio.js library.
- Generates single WARC file per worker (still need to add size rollover).
- Request interception via Fetch.requestPaused
- Rule-based rewriting response support (via wabac.js), using Fetch.getResponseBody() / Fetch.fulfillRequest()
- Streaming responses via three methods: inline response fetch via Fetch.takeResponseBodyAsStream, 
async loading via browser network stack with Network.loadNetworkResource() and node-based async fetch
via fetch()
- Direct async fetch() capture of non-HTML URLs
- Waiting for all requests to finish before moving on to the next page, up to the page timeout.
- Experimental: generate CDXJ on-the-fly as WARC is being written (not yet in use).
- removed pywb, using cdxj-indexer for --generateCDX option.
2023-11-07 21:38:50 -08:00

28 lines
1 KiB
JavaScript

import fs from "fs";
import zlib from "zlib";
import child_process from "child_process";
// Runs a one-page crawl inside the browsertrix-crawler Docker image, passing
// the YAML fixture on stdin, then decompresses the WARC written under
// test-crawls/collections/warcinfo/ and checks that the expected warcinfo
// text is present in it.
test("check that the warcinfo file works as expected on the command line", () => {
  try {
    const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
    // `input` is what pipes the config to the container's stdin (the crawler
    // is started with `--config stdin`). execSync has no `stdin` option, so
    // the previously passed `stdin: "inherit"` was ignored and is dropped.
    const crawlOutput = child_process.execSync(
      "docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC",
      { input: configYaml, encoding: "utf8" },
    );
    console.log(crawlOutput);
  } catch (error) {
    // Best-effort: a failing crawl command is logged rather than rethrown.
    // If no WARC was produced, the readFileSync below throws and fails the
    // test. NOTE(review): a WARC left over from an earlier run would still be
    // read here — consider clearing the collection before crawling.
    console.log(error);
  }

  const warcData = fs.readFileSync("test-crawls/collections/warcinfo/warcinfo_0.warc.gz");
  const warcText = zlib.gunzipSync(warcData).toString("utf8");

  // Presumably these values come from the crawl-2.yaml fixture — confirm.
  expect(warcText.includes("operator: test")).toBe(true);
  expect(warcText.includes("host: hostname")).toBe(true);

  // Software line: version strings start with a digit; the dot in
  // "warcio.js" is escaped so it only matches a literal ".".
  expect(/Browsertrix-Crawler \d[\w.-]+ \(with warcio\.js \d[\w.-]+\)/.test(warcText)).toBe(true);

  expect(warcText.includes("format: WARC File Format 1.0")).toBe(true);
});