mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-08 06:09:48 +00:00
For consistency: since an excluded page isn't written to pages.jsonl, no urn:pageinfo record should be added for excluded pages either.
This commit is contained in:
parent
ba4471f25b
commit
804becc5b6
2 changed files with 13 additions and 6 deletions
|
|
@ -118,6 +118,7 @@ export class Recorder extends EventEmitter {
|
|||
pageInfo!: PageInfoRecord;
|
||||
mainFrameId: string | null = null;
|
||||
skipRangeUrls!: Map<string, number>;
|
||||
skipPageInfo = false;
|
||||
|
||||
swTargetId?: string | null;
|
||||
swFrameIds = new Set<string>();
|
||||
|
|
@ -743,6 +744,7 @@ export class Recorder extends EventEmitter {
|
|||
);
|
||||
|
||||
if (errorReason) {
|
||||
this.skipPageInfo = true;
|
||||
await cdp.send("Fetch.failRequest", {
|
||||
requestId,
|
||||
errorReason,
|
||||
|
|
@ -946,6 +948,7 @@ export class Recorder extends EventEmitter {
|
|||
this.pendingRequests = new Map();
|
||||
this.skipIds = new Set();
|
||||
this.skipRangeUrls = new Map<string, number>();
|
||||
this.skipPageInfo = false;
|
||||
this.pageFinished = false;
|
||||
this.pageInfo = {
|
||||
pageid,
|
||||
|
|
@ -974,6 +977,14 @@ export class Recorder extends EventEmitter {
|
|||
}
|
||||
|
||||
writePageInfoRecord() {
|
||||
if (this.skipPageInfo) {
|
||||
logger.debug(
|
||||
"Skipping writing pageinfo for blocked page",
|
||||
{ url: "urn:pageinfo:" + this.pageUrl },
|
||||
"recorder",
|
||||
);
|
||||
return;
|
||||
}
|
||||
const text = JSON.stringify(this.pageInfo, null, 2);
|
||||
|
||||
const url = this.pageUrl;
|
||||
|
|
|
|||
|
|
@ -40,11 +40,7 @@ test("ensure exclusion applied on redirect URL, and URL is not requeued again",
|
|||
{ encoding: "utf-8" },
|
||||
);
|
||||
|
||||
// expect one occurrence
|
||||
// expect no urn:pageinfo records for excluded page
|
||||
const first = data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`);
|
||||
expect(first > 0).toBe(true);
|
||||
|
||||
// expect no other occurrences
|
||||
expect(data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`, first + 1)).toBe(-1);
|
||||
|
||||
expect(first < 0).toBe(true);
|
||||
});
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue