2024-02-16 14:36:32 -08:00
|
|
|
import child_process from "child_process";
|
|
|
|
import fs from "fs";
|
|
|
|
import path from "path";
|
|
|
|
import { WARCParser } from "warcio";
|
|
|
|
|
|
|
|
/**
 * End-to-end check that a crawl writes one `urn:pageinfo:` record per page
 * and that each record lists the expected resources.
 *
 * Runs a crawl through `docker run` against three URLs, then scans the
 * combined WARC it produced and passes the JSON payload of every matching
 * pageinfo record to its validator.
 */
test("run warc and ensure pageinfo records contain the correct resources", async () => {
  child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --url https://webrecorder.net/about --url https://invalid.invalid/ --scopeType page --collection page-info-test --combineWARC",
  );

  const filename = path.join(
    "test-crawls",
    "collections",
    "page-info-test",
    "page-info-test_0.warc.gz",
  );

  // Validators keyed by the WARC-Target-URI of the pageinfo record they check.
  // An entry is removed as soon as its record is seen, so each validator runs
  // at most once and anything still present after the scan was never found.
  const pendingValidators = new Map([
    ["urn:pageinfo:https://webrecorder.net/", validateResourcesIndex],
    ["urn:pageinfo:https://webrecorder.net/about", validateResourcesAbout],
    ["urn:pageinfo:https://invalid.invalid/", validateResourcesInvalid],
  ]);

  const nodeStream = fs.createReadStream(filename);

  try {
    const parser = new WARCParser(nodeStream);

    for await (const record of parser) {
      const targetURI = record.warcTargetURI;
      const validate = pendingValidators.get(targetURI);

      if (validate === undefined) {
        continue;
      }

      pendingValidators.delete(targetURI);

      const text = await record.contentText();
      validate(JSON.parse(text));
    }
  } finally {
    // Release the file handle even when a validator throws part-way through.
    nodeStream.destroy();
  }

  // Comparing against an empty list makes a failure name the missing URIs.
  expect([...pendingValidators.keys()]).toEqual([]);
});
|
|
|
|
|
|
|
|
/**
 * Asserts that a pageinfo payload matches what is expected for
 * https://webrecorder.net/.
 *
 * Checks that the `pageid`, `url`, `ts` and `urls` properties are present,
 * that `counts` equals `{ jsErrors: 0 }`, and that `urls` equals the exact
 * resource map below (status, mime and resource type per URL).
 *
 * @param {object} json - Parsed JSON body of the pageinfo record.
 * @throws Whatever `expect` throws when an assertion fails.
 */
function validateResourcesIndex(json) {
  for (const requiredKey of ["pageid", "url", "ts", "urls"]) {
    expect(json).toHaveProperty(requiredKey);
  }

  expect(json.counts).toEqual({ jsErrors: 0 });

  const expectedUrls = {
    "https://webrecorder.net/": {
      status: 200,
      mime: "text/html",
      type: "document",
    },
    "https://webrecorder.net/assets/fontawesome/all.css": {
      status: 200,
      mime: "text/css",
      type: "stylesheet",
    },
    "https://webrecorder.net/assets/wr-logo.svg": {
      status: 200,
      mime: "image/svg+xml",
      type: "image",
    },
    "https://webrecorder.net/assets/tools/awp-icon.png": {
      status: 200,
      mime: "image/png",
      type: "image",
    },
    "https://webrecorder.net/assets/tools/logo-pywb.png": {
      status: 200,
      mime: "image/png",
      type: "image",
    },
    "https://webrecorder.net/assets/tools/browsertrixcrawler.png": {
      status: 200,
      mime: "image/png",
      type: "image",
    },
    "https://webrecorder.net/assets/tools/rwp-icon.png": {
      status: 200,
      mime: "image/png",
      type: "image",
    },
    "https://webrecorder.net/assets/images/btrix-cloud.png": {
      status: 200,
      mime: "image/png",
      type: "image",
    },
    "https://webrecorder.net/assets/main.css": {
      status: 200,
      mime: "text/css",
      type: "stylesheet",
    },
    "https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@700;900&display=swap":
      { status: 200, mime: "text/css", type: "stylesheet" },
    "https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap":
      { status: 200, mime: "text/css", type: "stylesheet" },
    "https://stats.browsertrix.com/js/script.js": {
      status: 200,
      mime: "application/javascript",
      type: "script",
    },
    "https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
      { status: 200, mime: "font/woff2", type: "font" },
    "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
      { status: 200, mime: "font/woff2", type: "font" },
    "https://webrecorder.net/assets/favicon.ico": {
      status: 200,
      mime: "image/vnd.microsoft.icon",
      type: "other",
    },
    "https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2F&d=webrecorder.net":
      { status: 202, mime: "text/plain", type: "xhr" },
  };

  expect(json.urls).toEqual(expectedUrls);
}
|
|
|
|
|
|
|
|
/**
 * Asserts that a pageinfo payload matches what is expected for
 * https://webrecorder.net/about.
 *
 * Checks that the `pageid`, `url`, `ts` and `urls` properties are present,
 * that `counts` equals `{ jsErrors: 0 }`, and that `urls` equals the exact
 * resource map below. The stats pageview request is expected with status 0
 * and the error `net::ERR_BLOCKED_BY_CLIENT`, and no `mime` entry.
 *
 * @param {object} json - Parsed JSON body of the pageinfo record.
 * @throws Whatever `expect` throws when an assertion fails.
 */
function validateResourcesAbout(json) {
  for (const requiredKey of ["pageid", "url", "ts", "urls"]) {
    expect(json).toHaveProperty(requiredKey);
  }

  expect(json.counts).toEqual({ jsErrors: 0 });

  const expectedUrls = {
    "https://webrecorder.net/about": {
      status: 200,
      mime: "text/html",
      type: "document",
    },
    "https://webrecorder.net/assets/main.css": {
      status: 200,
      mime: "text/css",
      type: "stylesheet",
    },
    "https://webrecorder.net/assets/fontawesome/all.css": {
      status: 200,
      mime: "text/css",
      type: "stylesheet",
    },
    "https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap":
      { status: 200, mime: "text/css", type: "stylesheet" },
    "https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@700;900&display=swap":
      { status: 200, mime: "text/css", type: "stylesheet" },
    "https://stats.browsertrix.com/js/script.js": {
      status: 200,
      mime: "application/javascript",
      type: "script",
    },
    "https://webrecorder.net/assets/wr-logo.svg": {
      status: 200,
      mime: "image/svg+xml",
      type: "image",
    },
    "https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
      { status: 200, mime: "font/woff2", type: "font" },
    "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
      { status: 200, mime: "font/woff2", type: "font" },
    "https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2Fabout&d=webrecorder.net":
      {
        status: 0,
        type: "xhr",
        error: "net::ERR_BLOCKED_BY_CLIENT",
      },
  };

  expect(json.urls).toEqual(expectedUrls);
}
|
|
|
|
|
|
|
|
/**
 * Asserts that a pageinfo payload matches what is expected for the
 * unresolvable page https://invalid.invalid/.
 *
 * Checks that the `pageid`, `url` and `urls` properties are present (no
 * `ts` check is made here), that `counts` equals `{ jsErrors: 0 }`, and that
 * `urls` holds a single document entry with status 0 and the error
 * `net::ERR_NAME_NOT_RESOLVED`.
 *
 * @param {object} json - Parsed JSON body of the pageinfo record.
 * @throws Whatever `expect` throws when an assertion fails.
 */
function validateResourcesInvalid(json) {
  for (const requiredKey of ["pageid", "url", "urls"]) {
    expect(json).toHaveProperty(requiredKey);
  }

  expect(json.counts).toEqual({ jsErrors: 0 });

  const expectedUrls = {
    "https://invalid.invalid/": {
      status: 0,
      type: "document",
      error: "net::ERR_NAME_NOT_RESOLVED",
    },
  };

  expect(json.urls).toEqual(expectedUrls);
}
|