For consistency: since an excluded page isn't written to pages.jsonl, no urn:pageinfo record should be added for that page either.

This commit is contained in:
Ilya Kreymer 2025-12-07 19:48:12 -08:00
parent ba4471f25b
commit 804becc5b6
2 changed files with 13 additions and 6 deletions

View file

@ -118,6 +118,7 @@ export class Recorder extends EventEmitter {
pageInfo!: PageInfoRecord;
mainFrameId: string | null = null;
skipRangeUrls!: Map<string, number>;
skipPageInfo = false;
swTargetId?: string | null;
swFrameIds = new Set<string>();
@ -743,6 +744,7 @@ export class Recorder extends EventEmitter {
);
if (errorReason) {
this.skipPageInfo = true;
await cdp.send("Fetch.failRequest", {
requestId,
errorReason,
@ -946,6 +948,7 @@ export class Recorder extends EventEmitter {
this.pendingRequests = new Map();
this.skipIds = new Set();
this.skipRangeUrls = new Map<string, number>();
this.skipPageInfo = false;
this.pageFinished = false;
this.pageInfo = {
pageid,
@ -974,6 +977,14 @@ export class Recorder extends EventEmitter {
}
writePageInfoRecord() {
if (this.skipPageInfo) {
logger.debug(
"Skipping writing pageinfo for blocked page",
{ url: "urn:pageinfo:" + this.pageUrl },
"recorder",
);
return;
}
const text = JSON.stringify(this.pageInfo, null, 2);
const url = this.pageUrl;

View file

@ -40,11 +40,7 @@ test("ensure exclusion applied on redirect URL, and URL is not requeued again",
{ encoding: "utf-8" },
);
// expect one occurence
// expect no urn:pageinfo records for excluded page
const first = data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`);
expect(first > 0).toBe(true);
// expect no other occurences
expect(data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`, first + 1)).toBe(-1);
expect(first < 0).toBe(true);
});