2024-02-16 14:36:32 -08:00
|
|
|
import child_process from "child_process";
|
|
|
|
import fs from "fs";
|
|
|
|
import path from "path";
|
|
|
|
import { WARCParser } from "warcio";
|
|
|
|
|
|
|
|
// Runs a crawl of three seed URLs in docker, then scans the combined WARC and
// checks (a) the WARC-Protocol header on the two webrecorder response records
// and (b) the contents of the first pageinfo record seen for each seed URL.
test("run warc and ensure pageinfo records contain the correct resources", async () => {
  child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --url https://old.webrecorder.net/about --url https://invalid.invalid/ --scopeType page --collection page-info-test --combineWARC",
  );

  const warcPath = path.join(
    "test-crawls",
    "collections",
    "page-info-test",
    "page-info-test_0.warc.gz",
  );
  const parser = new WARCParser(fs.createReadStream(warcPath));

  // Response records for these target URIs must report this exact WARC-Protocol value.
  const protocolCheckedUris = new Set([
    "https://old.webrecorder.net/",
    "https://old.webrecorder.net/about",
  ]);

  // Tracks which pageinfo records have been seen; asserted individually at the end.
  const seen = { index: false, about: false, invalid: false };

  // pageinfo target URI -> [flag name in `seen`, validator for the parsed JSON body].
  const pageInfoChecks = new Map([
    ["urn:pageinfo:https://old.webrecorder.net/", ["index", validateResourcesIndex]],
    ["urn:pageinfo:https://old.webrecorder.net/about", ["about", validateResourcesAbout]],
    ["urn:pageinfo:https://invalid.invalid/", ["invalid", validateResourcesInvalid]],
  ]);

  for await (const record of parser) {
    const targetUri = record.warcTargetURI;

    if (record.warcType === "response" && protocolCheckedUris.has(targetUri)) {
      expect(record.warcHeaders.headers.get("WARC-Protocol")).toBe("h2, tls/1.3");
    }

    const pending = pageInfoChecks.get(targetUri);
    if (pending !== undefined) {
      const [flag, validate] = pending;
      // Only the first record for each pageinfo URI is read and validated.
      if (!seen[flag]) {
        seen[flag] = true;
        const body = await record.contentText();
        validate(JSON.parse(body));
      }
    }
  }

  expect(seen.index).toBe(true);
  expect(seen.about).toBe(true);
  expect(seen.invalid).toBe(true);
});
|
|
|
|
|
|
|
|
/**
 * Asserts the shape and contents of the pageinfo JSON recorded for
 * https://old.webrecorder.net/: the top-level fields must be present,
 * `counts` must equal `{ jsErrors: 0 }`, and `urls` must equal the expected
 * resource map below.
 *
 * @param {object} json - Parsed body of the pageinfo record.
 */
function validateResourcesIndex(json) {
  for (const field of ["pageid", "url", "ts", "urls"]) {
    expect(json).toHaveProperty(field);
  }

  expect(json.counts).toEqual({ jsErrors: 0 });

  // Every expected entry on this page has status 200; only mime/type vary.
  const ok = (mime, type) => ({ status: 200, mime, type });

  const expectedUrls = {
    // Page document
    "https://old.webrecorder.net/": ok("text/html", "document"),

    // Stylesheets
    "https://old.webrecorder.net/assets/main.css": ok("text/css", "stylesheet"),
    "https://old.webrecorder.net/assets/fontawesome/all.css": ok("text/css", "stylesheet"),
    "https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@700;900&display=swap":
      ok("text/css", "stylesheet"),
    "https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap":
      ok("text/css", "stylesheet"),

    // Images
    "https://old.webrecorder.net/assets/tools/logo-pywb.png": ok("image/png", "image"),
    "https://old.webrecorder.net/assets/wr-logo.svg": ok("image/svg+xml", "image"),
    "https://old.webrecorder.net/assets/brand/archivewebpage-icon-color.svg":
      ok("image/svg+xml", "image"),
    "https://old.webrecorder.net/assets/brand/browsertrix-icon-color.svg":
      ok("image/svg+xml", "image"),
    "https://old.webrecorder.net/assets/brand/browsertrixcrawler-icon-color.svg":
      ok("image/svg+xml", "image"),
    "https://old.webrecorder.net/assets/brand/replaywebpage-icon-color.svg":
      ok("image/svg+xml", "image"),

    // Fonts
    "https://fonts.gstatic.com/s/sourcesanspro/v23/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
      ok("font/woff2", "font"),
    "https://fonts.gstatic.com/s/sourcesanspro/v23/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
      ok("font/woff2", "font"),

    // Favicon is reported with type "other"
    "https://old.webrecorder.net/assets/favicon.ico": ok("image/vnd.microsoft.icon", "other"),
  };

  expect(json.urls).toEqual(expectedUrls);
}
|
|
|
|
|
|
|
|
/**
 * Asserts the shape and contents of the pageinfo JSON recorded for
 * https://old.webrecorder.net/about: the top-level fields must be present,
 * `counts` must equal `{ jsErrors: 0 }`, and `urls` must equal the expected
 * resource map below.
 *
 * @param {object} json - Parsed body of the pageinfo record.
 */
function validateResourcesAbout(json) {
  const requiredFields = ["pageid", "url", "ts", "urls"];
  for (const name of requiredFields) {
    expect(json).toHaveProperty(name);
  }

  expect(json.counts).toEqual({ jsErrors: 0 });

  // Descriptors shared by several entries; spread so each entry is its own object.
  const stylesheet = { status: 200, mime: "text/css", type: "stylesheet" };
  const woff2Font = { status: 200, mime: "font/woff2", type: "font" };

  const expectedUrls = {
    "https://old.webrecorder.net/about": {
      status: 200,
      mime: "text/html",
      type: "document",
    },
    "https://old.webrecorder.net/assets/wr-logo.svg": {
      status: 200,
      mime: "image/svg+xml",
      type: "image",
    },
    "https://old.webrecorder.net/assets/main.css": { ...stylesheet },
    "https://old.webrecorder.net/assets/fontawesome/all.css": { ...stylesheet },
    "https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap": {
      ...stylesheet,
    },
    "https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@700;900&display=swap": {
      ...stylesheet,
    },
    "https://fonts.gstatic.com/s/sourcesanspro/v23/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2": {
      ...woff2Font,
    },
    "https://fonts.gstatic.com/s/sourcesanspro/v23/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": {
      ...woff2Font,
    },
  };

  expect(json.urls).toEqual(expectedUrls);
}
|
|
|
|
|
|
|
|
/**
 * Asserts the shape and contents of the pageinfo JSON recorded for
 * https://invalid.invalid/: `pageid`, `url` and `urls` must be present
 * (no `ts` assertion is made here), `counts` must equal `{ jsErrors: 0 }`,
 * and `urls` must contain only the failed document entry.
 *
 * @param {object} json - Parsed body of the pageinfo record.
 */
function validateResourcesInvalid(json) {
  for (const field of ["pageid", "url", "urls"]) {
    expect(json).toHaveProperty(field);
  }

  expect(json.counts).toEqual({ jsErrors: 0 });

  // The only expected entry: status 0 with a name-resolution error and no mime.
  const failedDocument = {
    status: 0,
    type: "document",
    error: "net::ERR_NAME_NOT_RESOLVED",
  };

  expect(json.urls).toEqual({ "https://invalid.invalid/": failedDocument });
}
|