Add option to write JSONL file with data on skipped pages (#966)

Fixes #965 

Add `--reportSkipped` argument, which will enable the creation of a
`reports/skippedPages.jsonl` file with the following fields for each
URL encountered that was not queued:

- `url`
- `seedUrl`
- `depth`
- `reason` (one of `outOfScope`, `pageLimit`, `robotsTxt`, or
`redirectToExcluded`)
- `ts`

The `reports/` directory is new and will likely be expanded with other
crawl-time reporting moving forward.

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
Tessa Walsh 2026-04-09 15:51:41 -04:00 committed by GitHub
parent 64fdaf0d11
commit 1c6e814e15
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 308 additions and 4 deletions

View file

@ -33,4 +33,12 @@ module.exports = {
"@typescript-eslint/await-thenable": "error",
},
reportUnusedDisableDirectives: true,
overrides: [
{
"files": ["tests/*.ts"],
"rules": {
"@typescript-eslint/no-floating-promises": "off"
}
}
]
};

View file

@ -337,6 +337,10 @@ Options:
--robotsAgent Agent to check in addition to '*' fo
r robots rules
[string] [default: "Browsertrix/1.x"]
--reportSkipped If set, write information about URLs
encountered but not queued to repor
ts/skippedPages.jsonl
[boolean] [default: false]
--config Path to YAML config file
```

View file

@ -46,6 +46,7 @@ import {
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
ExitCodes,
InterruptReason,
SkippedReason,
BxFunctionBindings,
MAX_JS_DIALOG_PER_PAGE,
CrawlStatus,
@ -119,6 +120,7 @@ export class Crawler {
pagesFH?: WriteStream | null = null;
extraPagesFH?: WriteStream | null = null;
skippedPagesFH?: WriteStream | null = null;
crawlId: string;
@ -149,6 +151,9 @@ export class Crawler {
seedPagesFile: string;
otherPagesFile: string;
reportsDir: string;
skippedPagesFile: string;
archivesDir: string;
warcCdxDir: string;
indexesDir: string;
@ -284,6 +289,12 @@ export class Crawler {
this.seedPagesFile = path.join(this.pagesDir, "pages.jsonl");
this.otherPagesFile = path.join(this.pagesDir, "extraPages.jsonl");
// reports directory
this.reportsDir = path.join(this.collDir, "reports");
// reports files
this.skippedPagesFile = path.join(this.reportsDir, "skippedPages.jsonl");
// archives dir
this.archivesDir = path.join(this.collDir, "archive");
@ -787,7 +798,13 @@ export class Crawler {
seedId,
);
return !!seed.isIncluded(url, depth, extraHops, logDetails);
const res = seed.isIncluded(url, depth, extraHops, logDetails);
if (!res) {
this.writeSkippedPage(url, seedId, depth, SkippedReason.OutOfScope);
}
return !!res;
}
async setupPage(opts: WorkerState) {
@ -1349,6 +1366,13 @@ self.__bx_behaviors.selectMainBehavior();
} else {
if (pageSkipped) {
await this.crawlState.markExcluded(url);
this.writeSkippedPage(
url,
data.seedId,
depth,
SkippedReason.RedirectToExcluded,
);
this.limitHit = false;
} else {
const retry = await this.crawlState.markFailed(url, noRetries);
@ -1822,6 +1846,13 @@ self.__bx_behaviors.selectMainBehavior();
this.otherPagesFile,
"Non-Seed Pages",
);
if (this.params.reportSkipped) {
this.skippedPagesFH = await this.initPages(
this.skippedPagesFile,
"Skipped Pages",
true,
);
}
this.adBlockRules = new AdBlockRules(
this.captureBasePrefix,
@ -1927,6 +1958,18 @@ self.__bx_behaviors.selectMainBehavior();
this.extraPagesFH = null;
}
}
if (this.skippedPagesFH) {
try {
await new Promise<void>((resolve) =>
this.skippedPagesFH!.close(() => resolve()),
);
} catch (e) {
// ignore
} finally {
this.skippedPagesFH = null;
}
}
}
async closeFiles() {
@ -2107,6 +2150,10 @@ self.__bx_behaviors.selectMainBehavior();
}
}
if (this.params.reportSkipped) {
waczOpts.reportsDir = this.reportsDir;
}
if (this.params.title) {
waczOpts.title = this.params.title;
}
@ -2587,6 +2634,12 @@ self.__bx_behaviors.selectMainBehavior();
);
if (!res) {
this.writeSkippedPage(
possibleUrl,
seedId,
depth,
SkippedReason.OutOfScope,
);
continue;
}
@ -2642,6 +2695,12 @@ self.__bx_behaviors.selectMainBehavior();
pageid?: string,
) {
if (this.limitHit) {
logger.debug(
"Page URL not queued, at page limit",
{ url, ...logDetails },
"links",
);
this.writeSkippedPage(url, seedId, depth, SkippedReason.PageLimit);
return false;
}
@ -2654,6 +2713,7 @@ self.__bx_behaviors.selectMainBehavior();
{ url, ...logDetails },
"links",
);
this.writeSkippedPage(url, seedId, depth, SkippedReason.RobotsTxt);
return false;
}
@ -2679,6 +2739,7 @@ self.__bx_behaviors.selectMainBehavior();
);
}
this.limitHit = true;
this.writeSkippedPage(url, seedId, depth, SkippedReason.PageLimit);
return false;
case QueueState.DUPE_URL:
@ -2693,11 +2754,13 @@ self.__bx_behaviors.selectMainBehavior();
return false;
}
async initPages(filename: string, title: string) {
async initPages(filename: string, title: string, isReport: boolean = false) {
let fh = null;
try {
await fsp.mkdir(this.pagesDir, { recursive: true });
await fsp.mkdir(isReport ? this.reportsDir : this.pagesDir, {
recursive: true,
});
const createNew = !fs.existsSync(filename);
@ -2810,6 +2873,48 @@ self.__bx_behaviors.selectMainBehavior();
}
}
// Append one JSONL record describing a URL that was encountered but not
// queued (skipped) to reports/skippedPages.jsonl.
// No-op unless the --reportSkipped option is enabled.
// Record fields: url, seedUrl, depth, seed (true when depth === 0),
// reason (SkippedReason), ts (ISO-8601 timestamp).
writeSkippedPage(
  url: string,
  seedId: number,
  depth: number,
  reason: SkippedReason,
) {
  if (!this.params.reportSkipped) {
    return;
  }

  // Bail out before doing any work if the report stream was never opened
  // (it is created in initPages when reportSkipped is set).
  if (!this.skippedPagesFH) {
    logger.error(
      "Can't write skipped pages, missing stream",
      {},
      "pageStatus",
    );
    return;
  }

  // seedUrl falls back to "" if seedId is out of range — TODO confirm
  // whether an unknown seedId can actually occur here.
  const seedUrl = this.seeds[seedId]?.url || "";
  // depth 0 means the skipped URL was itself a seed
  const seed = depth === 0;

  const row = {
    url,
    seedUrl,
    depth,
    seed,
    reason,
    ts: new Date().toISOString(),
  };

  try {
    this.skippedPagesFH.write(JSON.stringify(row) + "\n");
  } catch {
    // Best-effort reporting: a failed append should not abort the crawl
    logger.warn(
      "Page append failed",
      { pagesFile: this.skippedPagesFile },
      "pageStatus",
    );
  }
}
async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) {
if (!sitemap) {
return;

View file

@ -736,6 +736,13 @@ class ArgParser {
type: "string",
default: "Browsertrix/1.x",
},
reportSkipped: {
describe:
"If set, write information about URLs encountered but not queued to reports/skippedPages.jsonl",
type: "boolean",
default: false,
},
});
}

View file

@ -117,3 +117,10 @@ export type CrawlStatus =
| "canceled";
export const WARC_REFERS_TO_CONTAINER = "WARC-Refers-To-Container";
// Reason a URL was skipped (encountered but not queued), recorded in
// reports/skippedPages.jsonl when the --reportSkipped option is enabled.
export enum SkippedReason {
  // URL was not included by the seed's scope rules
  OutOfScope = "outOfScope",
  // page limit had already been reached when the URL was encountered
  PageLimit = "pageLimit",
  // URL was disallowed by robots.txt rules
  RobotsTxt = "robotsTxt",
  // page redirected to a URL matching an exclusion rule
  RedirectToExcluded = "redirectToExcluded",
}

View file

@ -39,6 +39,7 @@ export type WACZInitOpts = {
warcCdxDir: string;
indexesDir: string;
logDirectory: string;
reportsDir?: string;
softwareString: string;
@ -102,6 +103,7 @@ export class WACZ {
logsDir: string;
warcCdxDir: string;
indexesDir: string;
reportsDir: string | null;
datapackage: WACZDataPackage;
@ -119,6 +121,7 @@ export class WACZ {
this.warcCdxDir = config.warcCdxDir;
this.collDir = collDir;
this.indexesDir = config.indexesDir;
this.reportsDir = config.reportsDir || null;
this.datapackage = {
resources: [],
@ -144,13 +147,17 @@ export class WACZ {
}
generate(): Readable {
const files = [
const baseFiles = [
...this.warcs,
...addDirFiles(this.indexesDir),
...addDirFiles(this.pagesDir),
...addDirFiles(this.logsDir),
];
const files = this.reportsDir
? [...baseFiles, ...addDirFiles(this.reportsDir)]
: baseFiles;
const zip = makeZip(
this.iterDirForZip(files),
) as ReadableStream<Uint8Array>;

166
tests/skipped_pages.test.ts Normal file
View file

@ -0,0 +1,166 @@
import child_process from "child_process";
import fs from "fs";
import md5 from "md5";
//const doValidate = process.argv.filter((x) => x.startsWith('-validate'))[0];
//const testIf = (condition: string, ...args: Parameters<typeof test>) => condition ? test(...args) : test.skip(...args);
// Run a crawl with --reportSkipped and a 5-page limit so that out-of-scope
// links get recorded, then extract the WACZ so later tests can inspect
// the report copy and datapackage.json inside it.
test("ensure basic crawl run with docker run passes with reportSkipped option, out of scope pages", async () => {
  child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --generateWACZ --collection skipped-pages --workers 2 --reportSkipped --limit 5",
  );

  child_process.execSync(
    "unzip test-crawls/collections/skipped-pages/skipped-pages.wacz -d test-crawls/collections/skipped-pages/wacz",
  );
});
// testIf(doValidate, "validate wacz with skippedPages.jsonl", () => {
// child_process.execSync(
// "wacz validate --file ./test-crawls/collections/skipped-pages/skipped-pages.wacz",
// );
// });
test("ensure skippedPages.jsonl was written as expected, contains outOfScope page", () => {
  // Parse every JSONL line, dropping the header record (marked by "format")
  const records = fs
    .readFileSync(
      "test-crawls/collections/skipped-pages/reports/skippedPages.jsonl",
      "utf8",
    )
    .trim()
    .split("\n")
    .map((line) => JSON.parse(line))
    .filter((record) => !record.format);

  // Every remaining record must describe an out-of-scope skip with the
  // full set of expected fields
  for (const record of records) {
    expect(record).toHaveProperty("url");
    expect(record).toHaveProperty("seedUrl");
    expect(record).toHaveProperty("depth");
    expect(record).toHaveProperty("reason");
    expect(record.reason).toBe("outOfScope");
    expect(record).toHaveProperty("ts");
  }

  expect(records.length > 0).toBe(true);
});
test("ensure skippedPages.jsonl was written to wacz", () => {
  // The copy packaged inside the WACZ must be byte-identical to the
  // report written during the crawl; compare md5 digests of both files.
  const reportPath =
    "test-crawls/collections/skipped-pages/reports/skippedPages.jsonl";
  const waczCopyPath =
    "test-crawls/collections/skipped-pages/wacz/reports/skippedPages.jsonl";

  const hashOf = (filePath: string) => md5(fs.readFileSync(filePath, "utf8"));

  expect(hashOf(reportPath)).toEqual(hashOf(waczCopyPath));
});
// Verify the skipped-pages report is registered as a non-empty resource in
// the WACZ datapackage.json manifest.
test("check that skippedPages.jsonl file made it into WACZ datapackage.json", () => {
  const datapackagePath =
    "test-crawls/collections/skipped-pages/wacz/datapackage.json";

  expect(fs.existsSync(datapackagePath)).toBe(true);

  const dataPackageJSON = JSON.parse(fs.readFileSync(datapackagePath, "utf8"));

  // The report must appear in the resource list with a non-zero size
  const found = dataPackageJSON.resources.some(
    (res: { path: string; bytes: number }) =>
      res.path === "reports/skippedPages.jsonl" && res.bytes > 0,
  );

  expect(found).toBe(true);
});
// Crawl with --limit 1 and --extraHops 1 so links discovered on the single
// crawled page are skipped and recorded with the pageLimit reason.
test("ensure basic crawl run with docker run passes with reportSkipped option, pageLimit report", async () => {
  child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --extraHops 1 --reportSkipped --collection skipped-pages-2 --limit 1",
  );
});
test("ensure skippedPages.jsonl was written as expected, contains pageLimit page", () => {
  const raw = fs.readFileSync(
    "test-crawls/collections/skipped-pages-2/reports/skippedPages.jsonl",
    "utf8",
  );

  // Parse all lines, ignoring the JSONL header record ("format" field)
  const entries = raw
    .trim()
    .split("\n")
    .map((line) => JSON.parse(line))
    .filter((entry) => !entry.format);

  for (const entry of entries) {
    expect(entry).toHaveProperty("url");
    expect(entry).toHaveProperty("seedUrl");
    expect(entry).toHaveProperty("depth");
    expect(entry).toHaveProperty("reason");
    expect(entry.reason).toBe("pageLimit");
    expect(entry).toHaveProperty("ts");
  }

  // At least one page must have been skipped due to the page limit
  expect(entries.length > 0).toBe(true);
});
test("redirect to excluded page, crawl fails as no seeds crawled", () => {
  // The seed redirects to an excluded URL, so nothing is crawled and the
  // crawler exits non-zero, which makes execSync throw.
  let crawlThrew = false;

  try {
    child_process.execSync(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://old.webrecorder.net/ --exclude https://old.webrecorder.net/ --reportSkipped --collection skipped-pages-3",
    );
  } catch {
    crawlThrew = true;
  }

  expect(crawlThrew).toBe(true);
});
test("ensure skippedPages.jsonl was written as expected, contains redirectToExcluded page", () => {
  // Skip the JSONL header (identified by its "format" field) and verify
  // every remaining record describes a redirect-to-excluded skip.
  const records = fs
    .readFileSync(
      "test-crawls/collections/skipped-pages-3/reports/skippedPages.jsonl",
      "utf8",
    )
    .trim()
    .split("\n")
    .map((line) => JSON.parse(line))
    .filter((rec) => !rec.format);

  expect(records.length > 0).toBe(true);

  for (const rec of records) {
    for (const field of ["url", "seedUrl", "depth", "reason", "ts"]) {
      expect(rec).toHaveProperty(field);
    }
    expect(rec.reason).toBe("redirectToExcluded");
  }
});