Apply exclusions to redirects (#745)

- if a page redirects to an excluded URL, block loading of the page
- mark the page as excluded, don't retry it, and don't write it to the page list
- support generic blocking of pages based on the initial page response
- fixes #744
Ilya Kreymer 2025-01-28 11:28:23 -08:00, committed by GitHub
parent f7cbf9645b
commit a00866bbab
6 changed files with 136 additions and 40 deletions
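In outline, the change works as follows: the Recorder inspects the first response for each page; if it is a redirect whose resolved target matches a seed exclusion, the request is failed with BlockedByResponse, the browser surfaces this as net::ERR_BLOCKED_BY_RESPONSE, and the crawler marks the page excluded rather than failed, so it is neither retried nor written to the page list. Below is a minimal standalone sketch of the blocking decision, not the actual implementation (the real logic is Recorder.blockPageResponse in the diff that follows; the PageResponse type and the local isRedirectStatus helper are illustrative stand-ins):

// Illustrative sketch only -- simplified stand-ins for the crawler's types.
type PageResponse = { status: number; location?: string };

// Standard HTTP redirect status codes (the crawler uses its own helper).
const isRedirectStatus = (status: number): boolean =>
  [301, 302, 303, 307, 308].includes(status);

// Decide whether the initial page response should be blocked:
// only redirects whose resolved target is excluded are blocked.
function blockPageResponse(
  resp: PageResponse,
  pageUrl: string,
  isExcluded: (url: string) => boolean,
): "BlockedByResponse" | undefined {
  if (isRedirectStatus(resp.status) && resp.location) {
    // Location may be relative, so resolve it against the original page URL
    const target = new URL(resp.location, pageUrl).href;
    if (isExcluded(target)) {
      // returned as the errorReason for Fetch.failRequest; the page load then
      // fails with net::ERR_BLOCKED_BY_RESPONSE and is marked excluded
      return "BlockedByResponse";
    }
  }
  return undefined;
}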


@@ -192,6 +192,7 @@ export class Crawler {
     | ((opts: {
         page: Page;
         data: PageState;
+        seed: ScopedSeed;
         // eslint-disable-next-line no-use-before-define
         crawler: Crawler;
       }) => Promise<void>)
@@ -930,7 +931,7 @@ self.__bx_behaviors.selectMainBehavior();
   async crawlPage(opts: WorkerState): Promise<void> {
     await this.writeStats();
 
-    const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
+    const { page, cdp, data, workerid, callbacks, recorder } = opts;
     data.callbacks = callbacks;
 
     const { url, seedId } = data;
@@ -948,14 +949,14 @@ self.__bx_behaviors.selectMainBehavior();
     data.logDetails = logDetails;
     data.workerid = workerid;
 
-    if (directFetchCapture) {
+    if (recorder) {
       try {
         const headers = auth
           ? { Authorization: auth, ...this.headers }
           : this.headers;
 
         const result = await timedRun(
-          directFetchCapture({ url, headers, cdp }),
+          recorder.directFetchCapture({ url, headers, cdp }),
           this.params.pageLoadTimeout,
           "Direct fetch of page URL timed out",
           logDetails,
@@ -1013,11 +1014,21 @@ self.__bx_behaviors.selectMainBehavior();
       await page.setExtraHTTPHeaders({});
     }
 
+    const seed = await this.crawlState.getSeedAt(
+      this.seeds,
+      this.numOriginalSeeds,
+      seedId,
+    );
+
+    if (recorder) {
+      recorder.pageSeed = seed;
+    }
+
     // run custom driver here, if any
     if (this.driver) {
-      await this.driver({ page, data, crawler: this });
+      await this.driver({ page, data, crawler: this, seed });
     } else {
-      await this.loadPage(page, data);
+      await this.loadPage(page, data, seed);
     }
 
     data.title = await timedRun(
@@ -1155,7 +1166,7 @@ self.__bx_behaviors.selectMainBehavior();
   async pageFinished(data: PageState) {
     // if page loaded, considered page finished successfully
     // (even if behaviors timed out)
-    const { loadState, logDetails, depth, url, retry } = data;
+    const { loadState, logDetails, depth, url, retry, pageSkipped } = data;
 
     if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
       await this.writePage(data);
@@ -1172,11 +1183,14 @@ self.__bx_behaviors.selectMainBehavior();
       await this.checkLimits();
     } else {
-      if (retry >= MAX_RETRY_FAILED) {
+      if (retry >= MAX_RETRY_FAILED && !pageSkipped) {
         await this.writePage(data);
       }
-      await this.crawlState.markFailed(url);
+      if (pageSkipped) {
+        await this.crawlState.markExcluded(url);
+      } else {
+        await this.crawlState.markFailed(url);
+      }
       if (this.healthChecker) {
         this.healthChecker.incError();
       }
@@ -1861,7 +1875,7 @@ self.__bx_behaviors.selectMainBehavior();
     }
   }
 
-  async loadPage(page: Page, data: PageState) {
+  async loadPage(page: Page, data: PageState, seed: ScopedSeed) {
     const { url, depth } = data;
     const logDetails = data.logDetails;
@@ -1889,8 +1903,8 @@ self.__bx_behaviors.selectMainBehavior();
     // store the first successful non-redirect response, even if page doesn't load fully
     const waitFirstResponse = (resp: HTTPResponse) => {
-      firstResponse = resp;
-      if (!isRedirectStatus(firstResponse.status())) {
+      if (!isRedirectStatus(resp.status())) {
+        firstResponse = resp;
         // don't listen to any additional responses
         page.off("response", waitFirstResponse);
       }
@@ -1949,11 +1963,19 @@ self.__bx_behaviors.selectMainBehavior();
       } else if (!downloadResponse) {
         // log if not already log and rethrow, consider page failed
         if (msg !== "logged") {
-          logger.error("Page Load Failed, will retry", {
-            msg,
-            loadState: data.loadState,
-            ...logDetails,
-          });
+          const loadState = data.loadState;
+
+          // excluded in recorder
+          if (msg.startsWith("net::ERR_BLOCKED_BY_RESPONSE")) {
+            data.pageSkipped = true;
+            logger.warn("Page Load Blocked, skipping", { msg, loadState });
+          } else {
+            logger.error("Page Load Failed, will retry", {
+              msg,
+              loadState,
+              ...logDetails,
+            });
+          }
           e.message = "logged";
         }
         throw e;
@@ -2064,12 +2086,6 @@ self.__bx_behaviors.selectMainBehavior();
     const { seedId, extraHops } = data;
 
-    const seed = await this.crawlState.getSeedAt(
-      this.seeds,
-      this.numOriginalSeeds,
-      seedId,
-    );
-
     if (!seed) {
       logger.error(
         "Seed not found, likely invalid crawl state - skipping link extraction and behaviors",


@@ -24,6 +24,7 @@ import { RedisCrawlState, WorkerId } from "./state.js";
 import { CDPSession, Protocol } from "puppeteer-core";
 import { Crawler } from "../crawler.js";
 import { getProxyDispatcher } from "./proxy.js";
+import { ScopedSeed } from "./seeds.js";
 
 const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
 const MAX_TEXT_REWRITE_SIZE = 25_000_000;
@@ -148,6 +149,8 @@ export class Recorder {
   pageUrl!: string;
   pageid!: string;
 
+  pageSeed?: ScopedSeed;
+
   frameIdToExecId: Map<string, number> | null;
 
   constructor({
@@ -691,11 +694,27 @@ export class Recorder {
     reqresp.fetchContinued = true;
 
+    reqresp.fillFetchRequestPaused(params);
+
     if (
       url === this.pageUrl &&
       (!this.pageInfo.ts ||
-        (responseStatusCode && responseStatusCode < this.pageInfo.tsStatus))
+        (responseStatusCode && responseStatusCode <= this.pageInfo.tsStatus))
     ) {
+      const errorReason = await this.blockPageResponse(
+        url,
+        reqresp,
+        responseHeaders,
+      );
+
+      if (errorReason) {
+        await cdp.send("Fetch.failRequest", {
+          requestId,
+          errorReason,
+        });
+        return true;
+      }
+
       logger.debug("Setting page timestamp", {
         ts: reqresp.ts,
         url,
@@ -706,8 +725,6 @@ export class Recorder {
       this.mainFrameId = params.frameId;
     }
 
-    reqresp.fillFetchRequestPaused(params);
-
     if (this.noResponseForStatus(responseStatusCode)) {
       reqresp.payload = new Uint8Array();
       return false;
@@ -866,6 +883,34 @@ export class Recorder {
     return true;
   }
 
+  async blockPageResponse(
+    url: string,
+    reqresp: RequestResponseInfo,
+    responseHeaders?: Protocol.Fetch.HeaderEntry[],
+  ): Promise<Protocol.Network.ErrorReason | undefined> {
+    if (reqresp.isRedirectStatus()) {
+      try {
+        let loc = this.getLocation(responseHeaders);
+
+        if (loc) {
+          loc = new URL(loc, url).href;
+
+          if (this.pageSeed && this.pageSeed.isExcluded(loc)) {
+            logger.warn(
+              "Skipping page that redirects to excluded URL",
+              { newUrl: loc, origUrl: this.pageUrl },
+              "recorder",
+            );
+            return "BlockedByResponse";
+          }
+        }
+      } catch (e) {
+        // ignore
+        logger.debug("Redirect check error", e, "recorder");
+      }
+    }
+  }
+
   startPage({ pageid, url }: { pageid: string; url: string }) {
     this.pageid = pageid;
     this.pageUrl = url;
@@ -1187,6 +1232,21 @@ export class Recorder {
     return null;
   }
 
+  protected getLocation(
+    headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
+  ) {
+    if (!headers) {
+      return null;
+    }
+
+    for (const header of headers) {
+      if (header.name.toLowerCase() === "location") {
+        return header.value;
+      }
+    }
+
+    return null;
+  }
+
   protected _getContentLen(headers?: Protocol.Fetch.HeaderEntry[]) {
     if (!headers) {
       return -1;


@@ -280,15 +280,23 @@ export class ScopedSeed {
       }
     }
 
+    if (this.isExcluded(url)) {
+      return false;
+    }
+
+    return { url, isOOS };
+  }
+
+  isExcluded(url: string) {
     // check exclusions
     for (const e of this.exclude) {
       if (e.test(url)) {
         //console.log(`Skipping ${url} excluded by ${e}`);
-        return false;
+        return true;
       }
     }
 
-    return { url, isOOS };
+    return false;
   }
 }
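The refactor above splits the exclusion test out of isIncluded() into a reusable isExcluded(), which the Recorder can now apply to redirect targets as well as to newly queued links. As a small illustration of the matching semantics (a bare RegExp list standing in for ScopedSeed.exclude; each --exclude value is compiled to a regex and tested against the full URL):

// Illustration only: a bare RegExp list standing in for ScopedSeed.exclude.
// `--exclude help` compiles to /help/, matching any URL that contains "help".
const exclude: RegExp[] = [/help/];

const isExcluded = (url: string): boolean => exclude.some((e) => e.test(url));

console.log(isExcluded("https://www.iana.org/domains/example")); // false
console.log(isExcluded("https://www.iana.org/help/example-domains")); // true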


@@ -74,6 +74,7 @@ export class PageState {
   favicon?: string;
 
   skipBehaviors = false;
+  pageSkipped = false;
   filteredFrames: Frame[] = [];
   loadState: LoadState = LoadState.FAILED;


@@ -2,11 +2,7 @@ import os from "os";
 import { logger, formatErr } from "./logger.js";
 import { sleep, timedRun } from "./timing.js";
-import {
-  DirectFetchRequest,
-  DirectFetchResponse,
-  Recorder,
-} from "./recorder.js";
+import { Recorder } from "./recorder.js";
 import { rxEscape } from "./seeds.js";
 import { CDPSession, Page } from "puppeteer-core";
 import { PageState, WorkerId } from "./state.js";
@@ -24,9 +20,6 @@ export type WorkerState = {
   workerid: WorkerId;
   // eslint-disable-next-line @typescript-eslint/ban-types
   callbacks: Record<string, Function>;
-  directFetchCapture:
-    | ((request: DirectFetchRequest) => Promise<DirectFetchResponse>)
-    | null;
   recorder: Recorder | null;
   markPageUsed: () => void;
   frameIdToExecId: Map<string, number>;
@@ -175,16 +168,13 @@ export class PageWorker {
     this.page = page;
     this.cdp = cdp;
     this.callbacks = {};
 
-    const directFetchCapture = this.recorder
-      ? (req: DirectFetchRequest) => this.recorder!.directFetchCapture(req)
-      : null;
-
     this.opts = {
       page,
       cdp,
       workerid,
       callbacks: this.callbacks,
       recorder: this.recorder,
-      directFetchCapture,
       frameIdToExecId: new Map<string, number>(),
       markPageUsed: () => {
         if (!this.alwaysReuse) {


@@ -0,0 +1,21 @@
+import fs from "fs";
+import { execSync } from "child_process";
+
+// example.com includes a link to 'https://www.iana.org/domains/example', which redirects to 'https://www.iana.org/help/example-domains'
+// page loading should be blocked on the redirect due to the exclusion of 'help', though the initial link itself is crawled
+test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
+  execSync(
+    "docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --exclude help --collection redir-exclude-test --extraHops 1",
+  );
+
+  // no entries besides header
+  expect(
+    fs
+      .readFileSync(
+        "test-crawls/collections/redir-exclude-test/pages/extraPages.jsonl",
+        "utf8",
+      )
+      .trim()
+      .split("\n").length,
+  ).toBe(1);
+});