Separate writing of pages into pages.jsonl + extraPages.jsonl for use with new py-wacz (#535)

Cherry-picked from the use-js-wacz branch; implements separate writing of
pages.jsonl / extraPages.jsonl, to be used with py-wacz and its new
`--copy-pages` flag.

Depends on py-wacz 0.5.0 (via webrecorder/py-wacz#43)
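
For reference, a crawl now writes two page lists under the collection's
`pages/` directory. A sketch of the first (header) line of each, based on the
diff below (`hasText` reflects whether `--text to-pages` is enabled):

```
pages/pages.jsonl:      {"format":"json-pages-1.0","id":"pages","title":"Seed Pages","hasText":false}
pages/extraPages.jsonl: {"format":"json-pages-1.0","id":"pages","title":"Non-Seed Pages","hasText":false}
```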

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer, 2024-04-11 13:55:52 -07:00 (committed by GitHub)
parent c247189474
commit b5f3238c29
6 changed files with 121 additions and 32 deletions

requirements.txt

@@ -1 +1 @@
-wacz>=0.4.9
+wacz>=0.5.0

src/crawler.ts

@@ -2,7 +2,7 @@ import child_process, { ChildProcess, StdioOptions } from "child_process";
 import path from "path";
 import fs, { WriteStream } from "fs";
 import os from "os";
-import fsp, { FileHandle } from "fs/promises";
+import fsp from "fs/promises";

 import {
   RedisCrawlState,
@@ -120,7 +120,8 @@ export class Crawler {
   crawlState!: RedisCrawlState;

-  pagesFH?: FileHandle | null = null;
+  pagesFH?: WriteStream | null = null;
+  extraPagesFH?: WriteStream | null = null;

   logFH!: WriteStream;
   crawlId: string;
@@ -146,7 +147,8 @@
   gotoOpts: Record<string, any>;

   pagesDir: string;
-  pagesFile: string;
+  seedPagesFile: string;
+  otherPagesFile: string;

   archivesDir: string;
   tempdir: string;
@@ -270,7 +272,8 @@
     this.pagesDir = path.join(this.collDir, "pages");

     // pages file
-    this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
+    this.seedPagesFile = path.join(this.pagesDir, "pages.jsonl");
+    this.otherPagesFile = path.join(this.pagesDir, "extraPages.jsonl");

     // archives dir
     this.archivesDir = path.join(this.collDir, "archive");
@@ -1278,7 +1281,11 @@ self.__bx_behaviors.selectMainBehavior();
     await this.crawlState.setStatus("running");

-    await this.initPages();
+    this.pagesFH = await this.initPages(this.seedPagesFile, "Seed Pages");
+    this.extraPagesFH = await this.initPages(
+      this.otherPagesFile,
+      "Non-Seed Pages",
+    );

     this.adBlockRules = new AdBlockRules(
       this.captureBasePrefix,
@@ -1332,10 +1339,7 @@ self.__bx_behaviors.selectMainBehavior();
     await this.serializeConfig(true);

-    if (this.pagesFH) {
-      await this.pagesFH.sync();
-      await this.pagesFH.close();
-    }
+    await this.closePages();

     await this.closeFiles();
@@ -1349,6 +1353,32 @@ self.__bx_behaviors.selectMainBehavior();
     await this.postCrawl();
   }

+  async closePages() {
+    if (this.pagesFH) {
+      try {
+        await new Promise<void>((resolve) =>
+          this.pagesFH!.close(() => resolve()),
+        );
+      } catch (e) {
+        // ignore
+      } finally {
+        this.pagesFH = null;
+      }
+    }
+
+    if (this.extraPagesFH) {
+      try {
+        await new Promise<void>((resolve) =>
+          this.extraPagesFH!.close(() => resolve()),
+        );
+      } catch (e) {
+        // ignore
+      } finally {
+        this.extraPagesFH = null;
+      }
+    }
+  }
+
   async closeFiles() {
     if (this.textWriter) {
       await this.textWriter.flush();
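The switch from a `FileHandle` to a `WriteStream` is why `closePages()` wraps `close()` in a Promise: unlike `FileHandle.close()`, `WriteStream.close()` takes a callback rather than returning a promise. A minimal standalone sketch of the same pattern (the helper name here is illustrative, not from the diff):

```ts
import fs from "fs";

// Resolve once WriteStream.close() invokes its callback; any close error
// is not inspected, mirroring the "// ignore" catch blocks in closePages().
async function closeStream(stream: fs.WriteStream): Promise<void> {
  await new Promise<void>((resolve) => stream.close(() => resolve()));
}
```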
@@ -1490,11 +1520,13 @@ self.__bx_behaviors.selectMainBehavior();
     const createArgs = [
       "create",
-      "--split-seeds",
       "-o",
       waczPath,
       "--pages",
-      this.pagesFile,
+      this.seedPagesFile,
+      "--extra-pages",
+      this.otherPagesFile,
+      "--copy-pages",
       "--log-directory",
       this.logDir,
     ];
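Because the crawler now writes pre-split page lists itself, `--split-seeds` is dropped in favor of `--copy-pages`, which has py-wacz take the already-built JSONL files as-is (hence the `wacz>=0.5.0` bump above). The spawned command is roughly equivalent to the following invocation, with illustrative paths:

```
wacz create -o <crawl>.wacz \
  --pages collections/<coll>/pages/pages.jsonl \
  --extra-pages collections/<coll>/pages/extraPages.jsonl \
  --copy-pages \
  --log-directory <logDir>
```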
@@ -2027,36 +2059,34 @@ self.__bx_behaviors.selectMainBehavior();
     return false;
   }

-  async initPages() {
+  async initPages(filename: string, title: string) {
+    let fh = null;
+
     try {
-      let createNew = false;
+      await fsp.mkdir(this.pagesDir, { recursive: true });

-      // create pages dir if doesn't exist and write pages.jsonl header
-      if (fs.existsSync(this.pagesDir) != true) {
-        await fsp.mkdir(this.pagesDir);
-        createNew = true;
-      }
+      const createNew = !fs.existsSync(filename);

-      this.pagesFH = await fsp.open(this.pagesFile, "a");
+      fh = fs.createWriteStream(filename, { flags: "a" });

       if (createNew) {
         const header: Record<string, string> = {
           format: "json-pages-1.0",
           id: "pages",
-          title: "All Pages",
+          title,
         };
-        header["hasText"] = String(this.textInPages);
+        header.hasText = this.params.text.includes("to-pages");
         if (this.params.text.length) {
           logger.debug("Text Extraction: " + this.params.text.join(","));
         } else {
           logger.debug("Text Extraction: None");
         }
-        const header_formatted = JSON.stringify(header).concat("\n");
-        await this.pagesFH.writeFile(header_formatted);
+        await fh.write(JSON.stringify(header) + "\n");
       }
     } catch (err) {
-      logger.error("pages/pages.jsonl creation failed", err);
+      logger.error(`"${filename}" creation failed`, err);
     }
+    return fh;
   }

   protected pageEntryForRedis(
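Note that the stream is opened with `flags: "a"` and the header is only written when `filename` does not already exist, so an interrupted-and-restarted crawl appends to the existing page lists rather than overwriting them.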
@@ -2085,7 +2115,11 @@ self.__bx_behaviors.selectMainBehavior();
     let { ts } = state;
     if (!ts) {
       ts = new Date();
-      logger.warn("Page date missing, setting to now", { url, ts });
+      logger.warn(
+        "Page date missing, setting to now",
+        { url, ts },
+        "pageStatus",
+      );
     }

     row.ts = ts.toISOString();
@@ -2117,10 +2151,22 @@ self.__bx_behaviors.selectMainBehavior();
     }

     const processedRow = JSON.stringify(row) + "\n";

+    const pagesFH = depth > 0 ? this.extraPagesFH : this.pagesFH;
+
+    if (!pagesFH) {
+      logger.error("Can't write pages, missing stream", {}, "pageStatus");
+      return;
+    }
+
     try {
-      await this.pagesFH!.writeFile(processedRow);
+      await pagesFH.write(processedRow);
     } catch (err) {
-      logger.warn("pages/pages.jsonl append failed", err);
+      logger.warn(
+        "Page append failed",
+        { pagesFile: depth > 0 ? this.otherPagesFile : this.seedPagesFile },
+        "pageStatus",
+      );
     }
   }
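Routing is purely by hop depth: depth-0 (seed) pages go to pages.jsonl, while everything discovered beyond the seeds (depth > 0) goes to extraPages.jsonl, matching the "Seed Pages" / "Non-Seed Pages" headers written by initPages().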

tests/custom_driver.test.js

@@ -24,13 +24,29 @@ test("ensure custom driver with custom selector crawls JS files as pages", async
     pages.add(url);
   }

-  console.log(pages);
+  const crawledExtraPages = fs.readFileSync(
+    "test-crawls/collections/custom-driver-1/pages/extraPages.jsonl",
+    "utf8",
+  );
+  const extraPages = new Set();
+
+  for (const line of crawledExtraPages.trim().split("\n")) {
+    const url = JSON.parse(line).url;
+    if (!url) {
+      continue;
+    }
+    extraPages.add(url);
+  }

   const expectedPages = new Set([
     "https://www.iana.org/",
-    "https://www.iana.org/_js/jquery.js",
-    "https://www.iana.org/_js/iana.js",
   ]);

+  const expectedExtraPages = new Set([
+    "https://www.iana.org/_js/jquery.js",
+    "https://www.iana.org/_js/iana.js",
+  ]);
+
   expect(pages).toEqual(expectedPages);
+  expect(extraPages).toEqual(expectedExtraPages);
 });
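With the split, the JS files reached through the custom selector are non-seed pages, so their assertions move from expectedPages into a new expectedExtraPages set.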

tests/extra_hops_depth.test.js

@@ -24,8 +24,17 @@ test(
     );
     const crawledPagesArray = crawledPages.trim().split("\n");

+    const crawledExtraPages = fs.readFileSync(
+      "test-crawls/collections/extra-hops-beyond/pages/extraPages.jsonl",
+      "utf8",
+    );
+    const crawledExtraPagesArray = crawledExtraPages.trim().split("\n");
+
     const expectedPages = [
       "https://webrecorder.net/",
+    ];
+
+    const expectedExtraPages = [
       "https://webrecorder.net/blog",
       "https://webrecorder.net/tools",
       "https://webrecorder.net/community",
@@ -36,6 +45,7 @@ test(
     // first line is the header, not page, so adding -1
     expect(crawledPagesArray.length - 1).toEqual(expectedPages.length);
+    expect(crawledExtraPagesArray.length - 1).toEqual(expectedExtraPages.length);

     for (const page of crawledPagesArray) {
       const url = JSON.parse(page).url;
@@ -44,6 +54,14 @@ test(
       }
       expect(expectedPages.indexOf(url) >= 0).toBe(true);
     }

+    for (const page of crawledExtraPagesArray) {
+      const url = JSON.parse(page).url;
+      if (!url) {
+        continue;
+      }
+      expect(expectedExtraPages.indexOf(url) >= 0).toBe(true);
+    }
   },
   extraHopsTimeout,
 );

tests/saved_state.test.js

@@ -6,6 +6,7 @@ import Redis from "ioredis";

 const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";
+const extraPagesFile = "test-crawls/collections/int-state-test/pages/extraPages.jsonl";

 function sleep(ms) {
@@ -159,12 +160,20 @@ test("check crawl restarted with saved state", async () => {
   }
 });

-test("ensure correct number of pages was written", () => {
+test("ensure correct number of pages was written to pages + extraPages", () => {
   const pages = fs
     .readFileSync(pagesFile, { encoding: "utf-8" })
     .trim()
     .split("\n");

   // first line is the header
-  expect(pages.length).toBe(10 + 1);
+  expect(pages.length).toBe(2);
+
+  const extraPages = fs
+    .readFileSync(extraPagesFile, { encoding: "utf-8" })
+    .trim()
+    .split("\n");
+
+  // first line is the header
+  expect(extraPages.length).toBe(10);
 });
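The line counts still add up: previously pages.jsonl held one header line plus all 10 crawled pages (10 + 1); with the split, pages.jsonl holds the header plus the single seed page (2 lines), and extraPages.jsonl holds its own header plus the nine non-seed pages (10 lines), covering the same 10 pages overall.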

tests/sitemap-parse.test.js

@@ -64,7 +64,7 @@ async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessTh
 }

 test("test sitemap fully finish", async () => {
-  await runCrawl(8036, "https://www.mozilla.org/", "", 0);
+  await runCrawl(7000, "https://www.mozilla.org/", "", 0);
 });

 test("test sitemap with limit", async () => {