mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2026-04-18 07:00:22 +00:00
Add option to write JSONL file with data on skipped pages (#966)
Fixes #965 Add `--reportSkipped` argument, which will enable the creation of a `reports/skippedPages.jsonl` file with the following elements for each URL encountered that was not queued: - `url` - `seedUrl` - `depth` - `reason` (one of `outOfScope`, `pageLimit`, `robotsTxt`, or `redirectToExcluded`) - `ts` The `reports/` directory is new and will likely be expanded with other crawl-time reporting moving forward. --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
parent
64fdaf0d11
commit
1c6e814e15
7 changed files with 308 additions and 4 deletions
|
|
@ -33,4 +33,12 @@ module.exports = {
|
|||
"@typescript-eslint/await-thenable": "error",
|
||||
},
|
||||
reportUnusedDisableDirectives: true,
|
||||
overrides: [
|
||||
{
|
||||
"files": ["tests/*.ts"],
|
||||
"rules": {
|
||||
"@typescript-eslint/no-floating-promises": "off"
|
||||
}
|
||||
}
|
||||
]
|
||||
};
|
||||
|
|
|
|||
|
|
@ -337,6 +337,10 @@ Options:
|
|||
--robotsAgent Agent to check in addition to '*' fo
|
||||
r robots rules
|
||||
[string] [default: "Browsertrix/1.x"]
|
||||
--reportSkipped If set, write information about URLs
|
||||
encountered but not queued to repor
|
||||
ts/skippedPages.jsonl
|
||||
[boolean] [default: false]
|
||||
--config Path to YAML config file
|
||||
```
|
||||
|
||||
|
|
|
|||
111
src/crawler.ts
111
src/crawler.ts
|
|
@ -46,6 +46,7 @@ import {
|
|||
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
|
||||
ExitCodes,
|
||||
InterruptReason,
|
||||
SkippedReason,
|
||||
BxFunctionBindings,
|
||||
MAX_JS_DIALOG_PER_PAGE,
|
||||
CrawlStatus,
|
||||
|
|
@ -119,6 +120,7 @@ export class Crawler {
|
|||
|
||||
pagesFH?: WriteStream | null = null;
|
||||
extraPagesFH?: WriteStream | null = null;
|
||||
skippedPagesFH?: WriteStream | null = null;
|
||||
|
||||
crawlId: string;
|
||||
|
||||
|
|
@ -149,6 +151,9 @@ export class Crawler {
|
|||
seedPagesFile: string;
|
||||
otherPagesFile: string;
|
||||
|
||||
reportsDir: string;
|
||||
skippedPagesFile: string;
|
||||
|
||||
archivesDir: string;
|
||||
warcCdxDir: string;
|
||||
indexesDir: string;
|
||||
|
|
@ -284,6 +289,12 @@ export class Crawler {
|
|||
this.seedPagesFile = path.join(this.pagesDir, "pages.jsonl");
|
||||
this.otherPagesFile = path.join(this.pagesDir, "extraPages.jsonl");
|
||||
|
||||
// reports directory
|
||||
this.reportsDir = path.join(this.collDir, "reports");
|
||||
|
||||
// reports files
|
||||
this.skippedPagesFile = path.join(this.reportsDir, "skippedPages.jsonl");
|
||||
|
||||
// archives dir
|
||||
this.archivesDir = path.join(this.collDir, "archive");
|
||||
|
||||
|
|
@ -787,7 +798,13 @@ export class Crawler {
|
|||
seedId,
|
||||
);
|
||||
|
||||
return !!seed.isIncluded(url, depth, extraHops, logDetails);
|
||||
const res = seed.isIncluded(url, depth, extraHops, logDetails);
|
||||
|
||||
if (!res) {
|
||||
this.writeSkippedPage(url, seedId, depth, SkippedReason.OutOfScope);
|
||||
}
|
||||
|
||||
return !!res;
|
||||
}
|
||||
|
||||
async setupPage(opts: WorkerState) {
|
||||
|
|
@ -1349,6 +1366,13 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
} else {
|
||||
if (pageSkipped) {
|
||||
await this.crawlState.markExcluded(url);
|
||||
|
||||
this.writeSkippedPage(
|
||||
url,
|
||||
data.seedId,
|
||||
depth,
|
||||
SkippedReason.RedirectToExcluded,
|
||||
);
|
||||
this.limitHit = false;
|
||||
} else {
|
||||
const retry = await this.crawlState.markFailed(url, noRetries);
|
||||
|
|
@ -1822,6 +1846,13 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
this.otherPagesFile,
|
||||
"Non-Seed Pages",
|
||||
);
|
||||
if (this.params.reportSkipped) {
|
||||
this.skippedPagesFH = await this.initPages(
|
||||
this.skippedPagesFile,
|
||||
"Skipped Pages",
|
||||
true,
|
||||
);
|
||||
}
|
||||
|
||||
this.adBlockRules = new AdBlockRules(
|
||||
this.captureBasePrefix,
|
||||
|
|
@ -1927,6 +1958,18 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
this.extraPagesFH = null;
|
||||
}
|
||||
}
|
||||
|
||||
if (this.skippedPagesFH) {
|
||||
try {
|
||||
await new Promise<void>((resolve) =>
|
||||
this.skippedPagesFH!.close(() => resolve()),
|
||||
);
|
||||
} catch (e) {
|
||||
// ignore
|
||||
} finally {
|
||||
this.skippedPagesFH = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async closeFiles() {
|
||||
|
|
@ -2107,6 +2150,10 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}
|
||||
}
|
||||
|
||||
if (this.params.reportSkipped) {
|
||||
waczOpts.reportsDir = this.reportsDir;
|
||||
}
|
||||
|
||||
if (this.params.title) {
|
||||
waczOpts.title = this.params.title;
|
||||
}
|
||||
|
|
@ -2587,6 +2634,12 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
);
|
||||
|
||||
if (!res) {
|
||||
this.writeSkippedPage(
|
||||
possibleUrl,
|
||||
seedId,
|
||||
depth,
|
||||
SkippedReason.OutOfScope,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -2642,6 +2695,12 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
pageid?: string,
|
||||
) {
|
||||
if (this.limitHit) {
|
||||
logger.debug(
|
||||
"Page URL not queued, at page limit",
|
||||
{ url, ...logDetails },
|
||||
"links",
|
||||
);
|
||||
this.writeSkippedPage(url, seedId, depth, SkippedReason.PageLimit);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -2654,6 +2713,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
{ url, ...logDetails },
|
||||
"links",
|
||||
);
|
||||
this.writeSkippedPage(url, seedId, depth, SkippedReason.RobotsTxt);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -2679,6 +2739,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
);
|
||||
}
|
||||
this.limitHit = true;
|
||||
this.writeSkippedPage(url, seedId, depth, SkippedReason.PageLimit);
|
||||
return false;
|
||||
|
||||
case QueueState.DUPE_URL:
|
||||
|
|
@ -2693,11 +2754,13 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
return false;
|
||||
}
|
||||
|
||||
async initPages(filename: string, title: string) {
|
||||
async initPages(filename: string, title: string, isReport: boolean = false) {
|
||||
let fh = null;
|
||||
|
||||
try {
|
||||
await fsp.mkdir(this.pagesDir, { recursive: true });
|
||||
await fsp.mkdir(isReport ? this.reportsDir : this.pagesDir, {
|
||||
recursive: true,
|
||||
});
|
||||
|
||||
const createNew = !fs.existsSync(filename);
|
||||
|
||||
|
|
@ -2810,6 +2873,48 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Append one JSONL row to the skipped pages report for a URL that was
 * encountered but not queued.
 *
 * Does nothing unless the `reportSkipped` option is set. Each row has the
 * fields `url`, `seedUrl`, `depth`, `seed`, `reason` and `ts`.
 *
 * @param url - URL that was not queued
 * @param seedId - index into `this.seeds`; its `url` is recorded as `seedUrl`
 *   (empty string when no seed exists at that index)
 * @param depth - depth of the URL; a depth of 0 sets `seed: true` on the row
 * @param reason - why the URL was skipped
 */
writeSkippedPage(
  url: string,
  seedId: number,
  depth: number,
  reason: SkippedReason,
) {
  if (!this.params.reportSkipped) {
    return;
  }

  // Check the stream before building the row: nothing can be written without it.
  if (!this.skippedPagesFH) {
    logger.error(
      "Can't write skipped pages, missing stream",
      {},
      "pageStatus",
    );
    return;
  }

  const row = {
    url,
    seedUrl: this.seeds[seedId]?.url ?? "",
    depth,
    seed: depth === 0,
    reason,
    ts: new Date().toISOString(),
  };

  try {
    // NOTE(review): stream writes usually report I/O failures asynchronously
    // via the stream's 'error' event, so this try/catch only covers
    // synchronous throws — confirm an 'error' listener is attached elsewhere.
    this.skippedPagesFH.write(JSON.stringify(row) + "\n");
  } catch (err) {
    // Include the cause so a failed append can be diagnosed from the log.
    logger.warn(
      "Page append failed",
      {
        pagesFile: this.skippedPagesFile,
        error: err instanceof Error ? err.message : String(err),
      },
      "pageStatus",
    );
  }
}
|
||||
|
||||
async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) {
|
||||
if (!sitemap) {
|
||||
return;
|
||||
|
|
|
|||
|
|
@ -736,6 +736,13 @@ class ArgParser {
|
|||
type: "string",
|
||||
default: "Browsertrix/1.x",
|
||||
},
|
||||
|
||||
reportSkipped: {
|
||||
describe:
|
||||
"If set, write information about URLs encountered but not queued to reports/skippedPages.jsonl",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -117,3 +117,10 @@ export type CrawlStatus =
|
|||
| "canceled";
|
||||
|
||||
export const WARC_REFERS_TO_CONTAINER = "WARC-Refers-To-Container";
|
||||
|
||||
/**
 * Values written to the `reason` field of `reports/skippedPages.jsonl`
 * for a URL that was encountered but not queued.
 */
export enum SkippedReason {
  /** The URL was not included by the seed's scope check. */
  OutOfScope = "outOfScope",
  /** The URL was not queued because the crawl's page limit was reached. */
  PageLimit = "pageLimit",
  /** The URL was not queued because of robots.txt rules. */
  RobotsTxt = "robotsTxt",
  /** The page was skipped after redirecting to an excluded URL. */
  RedirectToExcluded = "redirectToExcluded",
}
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@ export type WACZInitOpts = {
|
|||
warcCdxDir: string;
|
||||
indexesDir: string;
|
||||
logDirectory: string;
|
||||
reportsDir?: string;
|
||||
|
||||
softwareString: string;
|
||||
|
||||
|
|
@ -102,6 +103,7 @@ export class WACZ {
|
|||
logsDir: string;
|
||||
warcCdxDir: string;
|
||||
indexesDir: string;
|
||||
reportsDir: string | null;
|
||||
|
||||
datapackage: WACZDataPackage;
|
||||
|
||||
|
|
@ -119,6 +121,7 @@ export class WACZ {
|
|||
this.warcCdxDir = config.warcCdxDir;
|
||||
this.collDir = collDir;
|
||||
this.indexesDir = config.indexesDir;
|
||||
this.reportsDir = config.reportsDir || null;
|
||||
|
||||
this.datapackage = {
|
||||
resources: [],
|
||||
|
|
@ -144,13 +147,17 @@ export class WACZ {
|
|||
}
|
||||
|
||||
generate(): Readable {
|
||||
const files = [
|
||||
const baseFiles = [
|
||||
...this.warcs,
|
||||
...addDirFiles(this.indexesDir),
|
||||
...addDirFiles(this.pagesDir),
|
||||
...addDirFiles(this.logsDir),
|
||||
];
|
||||
|
||||
const files = this.reportsDir
|
||||
? [...baseFiles, ...addDirFiles(this.reportsDir)]
|
||||
: baseFiles;
|
||||
|
||||
const zip = makeZip(
|
||||
this.iterDirForZip(files),
|
||||
) as ReadableStream<Uint8Array>;
|
||||
|
|
|
|||
166
tests/skipped_pages.test.ts
Normal file
166
tests/skipped_pages.test.ts
Normal file
|
|
@ -0,0 +1,166 @@
|
|||
import child_process from "child_process";
|
||||
import fs from "fs";
|
||||
import md5 from "md5";
|
||||
|
||||
//const doValidate = process.argv.filter((x) => x.startsWith('-validate'))[0];
|
||||
//const testIf = (condition: string, ...args: Parameters<typeof test>) => condition ? test(...args) : test.skip(...args);
|
||||
|
||||
// Runs a crawl with --reportSkipped and --generateWACZ, then unzips the
// resulting WACZ into the collection's `wacz` directory.
test("ensure basic crawl run with docker run passes with reportSkipped option, out of scope pages", async () => {
  const crawlCmd =
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --generateWACZ --collection skipped-pages --workers 2 --reportSkipped --limit 5";
  const unzipCmd =
    "unzip test-crawls/collections/skipped-pages/skipped-pages.wacz -d test-crawls/collections/skipped-pages/wacz";

  // Order matters: the WACZ must exist before it can be unpacked.
  for (const cmd of [crawlCmd, unzipCmd]) {
    child_process.execSync(cmd);
  }
});
|
||||
|
||||
// testIf(doValidate, "validate wacz with skippedPages.jsonl", () => {
|
||||
// child_process.execSync(
|
||||
// "wacz validate --file ./test-crawls/collections/skipped-pages/skipped-pages.wacz",
|
||||
// );
|
||||
// });
|
||||
|
||||
// Every non-header row of the report must carry the expected fields and,
// for this crawl, the "outOfScope" reason; at least one such row must exist.
test("ensure skippedPages.jsonl was written as expected, contains outOfScope page", () => {
  const skippedPages = fs.readFileSync(
    "test-crawls/collections/skipped-pages/reports/skippedPages.jsonl",
    "utf8",
  );

  const rows = skippedPages
    .split("\n")
    // Drop blank lines so an empty report fails the count assertion below
    // rather than surfacing as a JSON parse error.
    .filter((line) => line.trim() !== "")
    .map((line) => JSON.parse(line))
    // Rows that have a `format` field are not page rows (presumably the
    // file's header line) and are ignored.
    .filter((data) => !data.format);

  for (const data of rows) {
    expect(data).toHaveProperty("url");
    expect(data).toHaveProperty("seedUrl");
    expect(data).toHaveProperty("depth");
    expect(data).toHaveProperty("reason");
    expect(data.reason).toBe("outOfScope");
    expect(data).toHaveProperty("ts");
  }

  // toBeGreaterThan reports the actual count on failure.
  expect(rows.length).toBeGreaterThan(0);
});
|
||||
|
||||
// The report unpacked from the WACZ must hash identically to the report
// written to the collection's reports directory.
test("ensure skippedPages.jsonl was written to wacz", () => {
  const hashOfFile = (filename: string) =>
    md5(fs.readFileSync(filename, "utf8"));

  const crawlHash = hashOfFile(
    "test-crawls/collections/skipped-pages/reports/skippedPages.jsonl",
  );
  const waczHash = hashOfFile(
    "test-crawls/collections/skipped-pages/wacz/reports/skippedPages.jsonl",
  );

  expect(crawlHash).toEqual(waczHash);
});
|
||||
|
||||
// The unpacked WACZ's datapackage.json must list reports/skippedPages.jsonl
// as a resource with a non-zero byte count.
test("check that skippedPages.jsonl file made it into WACZ datapackage.json", () => {
  const datapackagePath =
    "test-crawls/collections/skipped-pages/wacz/datapackage.json";

  expect(fs.existsSync(datapackagePath)).toBe(true);

  const dataPackageJSON = JSON.parse(fs.readFileSync(datapackagePath, "utf8"));
  const resources: { path: string; bytes: number }[] =
    dataPackageJSON.resources;

  const found = resources.some(
    (res) => res.path === "reports/skippedPages.jsonl" && res.bytes > 0,
  );

  expect(found).toBe(true);
});
|
||||
|
||||
// Runs a crawl limited to one page with one extra hop and --reportSkipped,
// writing into the `skipped-pages-2` collection.
test("ensure basic crawl run with docker run passes with reportSkipped option, pageLimit report", async () => {
  const crawlCmd =
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --extraHops 1 --reportSkipped --collection skipped-pages-2 --limit 1";

  child_process.execSync(crawlCmd);
});
|
||||
|
||||
// Every non-header row of the report must carry the expected fields and,
// for this crawl, the "pageLimit" reason; at least one such row must exist.
test("ensure skippedPages.jsonl was written as expected, contains pageLimit page", () => {
  const skippedPages = fs.readFileSync(
    "test-crawls/collections/skipped-pages-2/reports/skippedPages.jsonl",
    "utf8",
  );

  const rows = skippedPages
    .split("\n")
    // Drop blank lines so an empty report fails the count assertion below
    // rather than surfacing as a JSON parse error.
    .filter((line) => line.trim() !== "")
    .map((line) => JSON.parse(line))
    // Rows that have a `format` field are not page rows (presumably the
    // file's header line) and are ignored.
    .filter((data) => !data.format);

  for (const data of rows) {
    expect(data).toHaveProperty("url");
    expect(data).toHaveProperty("seedUrl");
    expect(data).toHaveProperty("depth");
    expect(data).toHaveProperty("reason");
    expect(data.reason).toBe("pageLimit");
    expect(data).toHaveProperty("ts");
  }

  // toBeGreaterThan reports the actual count on failure.
  expect(rows.length).toBeGreaterThan(0);
});
|
||||
|
||||
// The seed redirects to a URL matched by --exclude, so the crawl command is
// expected to fail; execSync throws when the command exits unsuccessfully.
test("redirect to excluded page, crawl fails as no seeds crawled", () => {
  expect(() =>
    child_process.execSync(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://old.webrecorder.net/ --exclude https://old.webrecorder.net/ --reportSkipped --collection skipped-pages-3",
    ),
  ).toThrow();
});
|
||||
|
||||
// Every non-header row of the report must carry the expected fields and,
// for this crawl, the "redirectToExcluded" reason; at least one such row
// must exist.
test("ensure skippedPages.jsonl was written as expected, contains redirectToExcluded page", () => {
  const skippedPages = fs.readFileSync(
    "test-crawls/collections/skipped-pages-3/reports/skippedPages.jsonl",
    "utf8",
  );

  const rows = skippedPages
    .split("\n")
    // Drop blank lines so an empty report fails the count assertion below
    // rather than surfacing as a JSON parse error.
    .filter((line) => line.trim() !== "")
    .map((line) => JSON.parse(line))
    // Rows that have a `format` field are not page rows (presumably the
    // file's header line) and are ignored.
    .filter((data) => !data.format);

  for (const data of rows) {
    expect(data).toHaveProperty("url");
    expect(data).toHaveProperty("seedUrl");
    expect(data).toHaveProperty("depth");
    expect(data).toHaveProperty("reason");
    expect(data.reason).toBe("redirectToExcluded");
    expect(data).toHaveProperty("ts");
  }

  // toBeGreaterThan reports the actual count on failure.
  expect(rows.length).toBeGreaterThan(0);
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue