Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 06:23:16 +00:00
Separate writing pages to pages.jsonl + extraPages.jsonl to use with new py-wacz (#535)
Cherry-picked from the use-js-wacz branch; this implements separate writing of pages.jsonl / extraPages.jsonl for use with py-wacz and the new `--copy-page-files` flag. Dependent on py-wacz 0.5.0 (via webrecorder/py-wacz#43).
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Parent c247189474 · commit b5f3238c29
6 changed files with 121 additions and 32 deletions
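To make the new layout concrete, here is a minimal sketch of how a consumer could read the two files after a crawl. The collection path and the helper are illustrative assumptions, not part of this commit; only the file names, the JSON-Pages header fields, and the seed/non-seed titles come from the change itself.

import fs from "fs";

// Placeholder path: real crawls write to <collection dir>/pages/
const pagesDir = "test-crawls/collections/example/pages";

function readPagesFile(name: string) {
  const lines = fs
    .readFileSync(`${pagesDir}/${name}`, { encoding: "utf-8" })
    .trim()
    .split("\n");
  // First line is the JSON-Pages header, e.g.
  // {"format":"json-pages-1.0","id":"pages","title":"Seed Pages","hasText":false}
  const header = JSON.parse(lines[0]);
  // Each remaining line is one page record (url, ts, ...)
  const pages = lines.slice(1).map((line) => JSON.parse(line));
  return { header, pages };
}

const seed = readPagesFile("pages.jsonl"); // header title: "Seed Pages"
const extra = readPagesFile("extraPages.jsonl"); // header title: "Non-Seed Pages"
console.log(seed.header.title, seed.pages.length);
console.log(extra.header.title, extra.pages.length);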
@@ -1 +1 @@
-wacz>=0.4.9
+wacz>=0.5.0
src/crawler.ts
@@ -2,7 +2,7 @@ import child_process, { ChildProcess, StdioOptions } from "child_process";
 import path from "path";
 import fs, { WriteStream } from "fs";
 import os from "os";
-import fsp, { FileHandle } from "fs/promises";
+import fsp from "fs/promises";
 
 import {
   RedisCrawlState,
@@ -120,7 +120,8 @@ export class Crawler {
 
   crawlState!: RedisCrawlState;
 
-  pagesFH?: FileHandle | null = null;
+  pagesFH?: WriteStream | null = null;
+  extraPagesFH?: WriteStream | null = null;
   logFH!: WriteStream;
 
   crawlId: string;
@@ -146,7 +147,8 @@
   gotoOpts: Record<string, any>;
 
   pagesDir: string;
-  pagesFile: string;
+  seedPagesFile: string;
+  otherPagesFile: string;
 
   archivesDir: string;
   tempdir: string;
@@ -270,7 +272,8 @@
     this.pagesDir = path.join(this.collDir, "pages");
 
     // pages file
-    this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
+    this.seedPagesFile = path.join(this.pagesDir, "pages.jsonl");
+    this.otherPagesFile = path.join(this.pagesDir, "extraPages.jsonl");
 
     // archives dir
     this.archivesDir = path.join(this.collDir, "archive");
@@ -1278,7 +1281,11 @@ self.__bx_behaviors.selectMainBehavior();
 
     await this.crawlState.setStatus("running");
 
-    await this.initPages();
+    this.pagesFH = await this.initPages(this.seedPagesFile, "Seed Pages");
+    this.extraPagesFH = await this.initPages(
+      this.otherPagesFile,
+      "Non-Seed Pages",
+    );
 
     this.adBlockRules = new AdBlockRules(
       this.captureBasePrefix,
@@ -1332,10 +1339,7 @@ self.__bx_behaviors.selectMainBehavior();
 
     await this.serializeConfig(true);
 
-    if (this.pagesFH) {
-      await this.pagesFH.sync();
-      await this.pagesFH.close();
-    }
+    await this.closePages();
 
     await this.closeFiles();
 
@@ -1349,6 +1353,32 @@ self.__bx_behaviors.selectMainBehavior();
     await this.postCrawl();
   }
 
+  async closePages() {
+    if (this.pagesFH) {
+      try {
+        await new Promise<void>((resolve) =>
+          this.pagesFH!.close(() => resolve()),
+        );
+      } catch (e) {
+        // ignore
+      } finally {
+        this.pagesFH = null;
+      }
+    }
+
+    if (this.extraPagesFH) {
+      try {
+        await new Promise<void>((resolve) =>
+          this.extraPagesFH!.close(() => resolve()),
+        );
+      } catch (e) {
+        // ignore
+      } finally {
+        this.extraPagesFH = null;
+      }
+    }
+  }
+
   async closeFiles() {
     if (this.textWriter) {
       await this.textWriter.flush();
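For context, fs.WriteStream#close() takes a callback rather than returning a promise, which is presumably why closePages() wraps each close in a Promise before nulling the field. An equivalent pattern using events.once is sketched below; this is an alternative shown for illustration, not what the commit uses.

import { WriteStream } from "fs";
import { once } from "events";

// Hypothetical helper: end a write stream and wait for its "close" event.
async function closeStream(fh: WriteStream | null): Promise<null> {
  if (fh) {
    fh.end();
    await once(fh, "close");
  }
  // Returning null mirrors how closePages() resets pagesFH / extraPagesFH.
  return null;
}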
@@ -1490,11 +1520,13 @@ self.__bx_behaviors.selectMainBehavior();
 
     const createArgs = [
       "create",
-      "--split-seeds",
       "-o",
       waczPath,
       "--pages",
-      this.pagesFile,
+      this.seedPagesFile,
+      "--extra-pages",
+      this.otherPagesFile,
+      "--copy-pages",
       "--log-directory",
       this.logDir,
     ];
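Taken together, the arguments above amount to roughly the following py-wacz invocation. This is a sketch with placeholder paths, and it omits the other inputs (such as the WARC files) that the crawler also passes; --copy-pages is the new py-wacz 0.5.0 capability this change depends on.

import { spawn } from "child_process";

// Placeholder paths; the crawler derives these from the collection directory.
const waczPath = "collections/example/example.wacz";
const pagesDir = "collections/example/pages";
const logDir = "collections/example/logs";

const createArgs = [
  "create",
  "-o",
  waczPath,
  "--pages",
  `${pagesDir}/pages.jsonl`,
  "--extra-pages",
  `${pagesDir}/extraPages.jsonl`,
  "--copy-pages",
  "--log-directory",
  logDir,
];

// Assumes the "wacz" CLI (py-wacz >= 0.5.0) is on PATH.
const wacz = spawn("wacz", createArgs, { stdio: "inherit" });
wacz.on("exit", (code) => console.log("wacz create exited with code", code));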
@@ -2027,36 +2059,34 @@ self.__bx_behaviors.selectMainBehavior();
     return false;
   }
 
-  async initPages() {
+  async initPages(filename: string, title: string) {
+    let fh = null;
+
     try {
-      let createNew = false;
+      await fsp.mkdir(this.pagesDir, { recursive: true });
 
-      // create pages dir if doesn't exist and write pages.jsonl header
-      if (fs.existsSync(this.pagesDir) != true) {
-        await fsp.mkdir(this.pagesDir);
-        createNew = true;
-      }
+      const createNew = !fs.existsSync(filename);
 
-      this.pagesFH = await fsp.open(this.pagesFile, "a");
+      fh = fs.createWriteStream(filename, { flags: "a" });
 
       if (createNew) {
         const header: Record<string, string> = {
           format: "json-pages-1.0",
           id: "pages",
-          title: "All Pages",
+          title,
         };
-        header["hasText"] = String(this.textInPages);
+        header.hasText = this.params.text.includes("to-pages");
         if (this.params.text.length) {
           logger.debug("Text Extraction: " + this.params.text.join(","));
         } else {
           logger.debug("Text Extraction: None");
         }
-        const header_formatted = JSON.stringify(header).concat("\n");
-        await this.pagesFH.writeFile(header_formatted);
+        await fh.write(JSON.stringify(header) + "\n");
       }
     } catch (err) {
-      logger.error("pages/pages.jsonl creation failed", err);
+      logger.error(`"${filename}" creation failed`, err);
     }
+    return fh;
   }
 
   protected pageEntryForRedis(
@@ -2085,7 +2115,11 @@ self.__bx_behaviors.selectMainBehavior();
     let { ts } = state;
     if (!ts) {
       ts = new Date();
-      logger.warn("Page date missing, setting to now", { url, ts });
+      logger.warn(
+        "Page date missing, setting to now",
+        { url, ts },
+        "pageStatus",
+      );
     }
 
     row.ts = ts.toISOString();
@@ -2117,10 +2151,22 @@ self.__bx_behaviors.selectMainBehavior();
     }
 
     const processedRow = JSON.stringify(row) + "\n";
+
+    const pagesFH = depth > 0 ? this.extraPagesFH : this.pagesFH;
+
+    if (!pagesFH) {
+      logger.error("Can't write pages, missing stream", {}, "pageStatus");
+      return;
+    }
+
     try {
-      await this.pagesFH!.writeFile(processedRow);
+      await pagesFH.write(processedRow);
     } catch (err) {
-      logger.warn("pages/pages.jsonl append failed", err);
+      logger.warn(
+        "Page append failed",
+        { pagesFile: depth > 0 ? this.otherPagesFile : this.seedPagesFile },
+        "pageStatus",
+      );
     }
   }
 
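Distilled out of the Crawler class, the new write path amounts to the following minimal sketch. The output directory and the page-record fields are simplified assumptions; error handling, text extraction, and the Redis-backed state are omitted.

import fs from "fs";
import fsp from "fs/promises";
import path from "path";

// Assumed output directory; the crawler uses <collDir>/pages.
const pagesDir = "collections/example/pages";

async function openPagesFile(filename: string, title: string) {
  await fsp.mkdir(pagesDir, { recursive: true });
  const createNew = !fs.existsSync(filename);
  const fh = fs.createWriteStream(filename, { flags: "a" });
  if (createNew) {
    // JSON-Pages header line, written once per file
    fh.write(JSON.stringify({ format: "json-pages-1.0", id: "pages", title }) + "\n");
  }
  return fh;
}

async function demo() {
  const seedFH = await openPagesFile(path.join(pagesDir, "pages.jsonl"), "Seed Pages");
  const extraFH = await openPagesFile(
    path.join(pagesDir, "extraPages.jsonl"),
    "Non-Seed Pages",
  );

  // Routing rule from writePage(): seeds (depth 0) go to pages.jsonl,
  // pages reached via links (depth > 0) go to extraPages.jsonl.
  const writePage = (row: { url: string; ts: string; depth: number }) => {
    const fh = row.depth > 0 ? extraFH : seedFH;
    fh.write(JSON.stringify(row) + "\n");
  };

  writePage({ url: "https://example.com/", ts: new Date().toISOString(), depth: 0 });
  writePage({ url: "https://example.com/about", ts: new Date().toISOString(), depth: 1 });
}

demo().catch(console.error);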
@@ -24,13 +24,29 @@ test("ensure custom driver with custom selector crawls JS files as pages", async
     pages.add(url);
   }
 
-  console.log(pages);
-
+  const crawledExtraPages = fs.readFileSync(
+    "test-crawls/collections/custom-driver-1/pages/extraPages.jsonl",
+    "utf8",
+  );
+  const extraPages = new Set();
+
+  for (const line of crawledExtraPages.trim().split("\n")) {
+    const url = JSON.parse(line).url;
+    if (!url) {
+      continue;
+    }
+    extraPages.add(url);
+  }
+
   const expectedPages = new Set([
     "https://www.iana.org/",
+  ]);
+
+  const expectedExtraPages = new Set([
     "https://www.iana.org/_js/jquery.js",
     "https://www.iana.org/_js/iana.js",
   ]);
 
   expect(pages).toEqual(expectedPages);
+  expect(extraPages).toEqual(expectedExtraPages);
 });
@@ -24,8 +24,17 @@ test(
     );
     const crawledPagesArray = crawledPages.trim().split("\n");
 
+    const crawledExtraPages = fs.readFileSync(
+      "test-crawls/collections/extra-hops-beyond/pages/extraPages.jsonl",
+      "utf8",
+    );
+    const crawledExtraPagesArray = crawledExtraPages.trim().split("\n");
+
     const expectedPages = [
       "https://webrecorder.net/",
+    ];
+
+    const expectedExtraPages = [
       "https://webrecorder.net/blog",
       "https://webrecorder.net/tools",
       "https://webrecorder.net/community",
@@ -36,6 +45,7 @@ test(
 
     // first line is the header, not page, so adding -1
     expect(crawledPagesArray.length - 1).toEqual(expectedPages.length);
+    expect(crawledExtraPagesArray.length - 1).toEqual(expectedExtraPages.length);
 
     for (const page of crawledPagesArray) {
       const url = JSON.parse(page).url;
@@ -44,6 +54,14 @@ test(
       }
       expect(expectedPages.indexOf(url) >= 0).toBe(true);
     }
+
+    for (const page of crawledExtraPagesArray) {
+      const url = JSON.parse(page).url;
+      if (!url) {
+        continue;
+      }
+      expect(expectedExtraPages.indexOf(url) >= 0).toBe(true);
+    }
   },
   extraHopsTimeout,
 );
@@ -6,6 +6,7 @@ import Redis from "ioredis";
 
 
 const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";
+const extraPagesFile = "test-crawls/collections/int-state-test/pages/extraPages.jsonl";
 
 
 function sleep(ms) {
@@ -159,12 +160,20 @@ test("check crawl restarted with saved state", async () => {
   }
 });
 
-test("ensure correct number of pages was written", () => {
+test("ensure correct number of pages was written to pages + extraPages", () => {
   const pages = fs
     .readFileSync(pagesFile, { encoding: "utf-8" })
     .trim()
     .split("\n");
 
   // first line is the header
-  expect(pages.length).toBe(10 + 1);
+  expect(pages.length).toBe(2);
+
+  const extraPages = fs
+    .readFileSync(extraPagesFile, { encoding: "utf-8" })
+    .trim()
+    .split("\n");
+
+  // first line is the header
+  expect(extraPages.length).toBe(10);
 });
@@ -64,7 +64,7 @@ async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessTh
 }
 
 test("test sitemap fully finish", async () => {
-  await runCrawl(8036, "https://www.mozilla.org/", "", 0);
+  await runCrawl(7000, "https://www.mozilla.org/", "", 0);
 });
 
 test("test sitemap with limit", async () => {