mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Apply exclusions to redirects (#745)
- if redirected page is excluded, block loading of page
- mark page as excluded, don't retry, and don't write to page list
- support generic blocking of pages based on initial page response
- fixes #744
This commit is contained in:
parent
f7cbf9645b
commit
a00866bbab
6 changed files with 136 additions and 40 deletions
|
@ -192,6 +192,7 @@ export class Crawler {
|
||||||
| ((opts: {
|
| ((opts: {
|
||||||
page: Page;
|
page: Page;
|
||||||
data: PageState;
|
data: PageState;
|
||||||
|
seed: ScopedSeed;
|
||||||
// eslint-disable-next-line no-use-before-define
|
// eslint-disable-next-line no-use-before-define
|
||||||
crawler: Crawler;
|
crawler: Crawler;
|
||||||
}) => Promise<void>)
|
}) => Promise<void>)
|
||||||
|
@ -930,7 +931,7 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
async crawlPage(opts: WorkerState): Promise<void> {
|
async crawlPage(opts: WorkerState): Promise<void> {
|
||||||
await this.writeStats();
|
await this.writeStats();
|
||||||
|
|
||||||
const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
|
const { page, cdp, data, workerid, callbacks, recorder } = opts;
|
||||||
data.callbacks = callbacks;
|
data.callbacks = callbacks;
|
||||||
|
|
||||||
const { url, seedId } = data;
|
const { url, seedId } = data;
|
||||||
|
@ -948,14 +949,14 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
data.logDetails = logDetails;
|
data.logDetails = logDetails;
|
||||||
data.workerid = workerid;
|
data.workerid = workerid;
|
||||||
|
|
||||||
if (directFetchCapture) {
|
if (recorder) {
|
||||||
try {
|
try {
|
||||||
const headers = auth
|
const headers = auth
|
||||||
? { Authorization: auth, ...this.headers }
|
? { Authorization: auth, ...this.headers }
|
||||||
: this.headers;
|
: this.headers;
|
||||||
|
|
||||||
const result = await timedRun(
|
const result = await timedRun(
|
||||||
directFetchCapture({ url, headers, cdp }),
|
recorder.directFetchCapture({ url, headers, cdp }),
|
||||||
this.params.pageLoadTimeout,
|
this.params.pageLoadTimeout,
|
||||||
"Direct fetch of page URL timed out",
|
"Direct fetch of page URL timed out",
|
||||||
logDetails,
|
logDetails,
|
||||||
|
@ -1013,11 +1014,21 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
await page.setExtraHTTPHeaders({});
|
await page.setExtraHTTPHeaders({});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const seed = await this.crawlState.getSeedAt(
|
||||||
|
this.seeds,
|
||||||
|
this.numOriginalSeeds,
|
||||||
|
seedId,
|
||||||
|
);
|
||||||
|
|
||||||
|
if (recorder) {
|
||||||
|
recorder.pageSeed = seed;
|
||||||
|
}
|
||||||
|
|
||||||
// run custom driver here, if any
|
// run custom driver here, if any
|
||||||
if (this.driver) {
|
if (this.driver) {
|
||||||
await this.driver({ page, data, crawler: this });
|
await this.driver({ page, data, crawler: this, seed });
|
||||||
} else {
|
} else {
|
||||||
await this.loadPage(page, data);
|
await this.loadPage(page, data, seed);
|
||||||
}
|
}
|
||||||
|
|
||||||
data.title = await timedRun(
|
data.title = await timedRun(
|
||||||
|
@ -1155,7 +1166,7 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
async pageFinished(data: PageState) {
|
async pageFinished(data: PageState) {
|
||||||
// if page loaded, considered page finished successfully
|
// if page loaded, considered page finished successfully
|
||||||
// (even if behaviors timed out)
|
// (even if behaviors timed out)
|
||||||
const { loadState, logDetails, depth, url, retry } = data;
|
const { loadState, logDetails, depth, url, retry, pageSkipped } = data;
|
||||||
|
|
||||||
if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
|
if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
|
||||||
await this.writePage(data);
|
await this.writePage(data);
|
||||||
|
@ -1172,11 +1183,14 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
|
|
||||||
await this.checkLimits();
|
await this.checkLimits();
|
||||||
} else {
|
} else {
|
||||||
if (retry >= MAX_RETRY_FAILED) {
|
if (retry >= MAX_RETRY_FAILED && !pageSkipped) {
|
||||||
await this.writePage(data);
|
await this.writePage(data);
|
||||||
}
|
}
|
||||||
await this.crawlState.markFailed(url);
|
if (pageSkipped) {
|
||||||
|
await this.crawlState.markExcluded(url);
|
||||||
|
} else {
|
||||||
|
await this.crawlState.markFailed(url);
|
||||||
|
}
|
||||||
if (this.healthChecker) {
|
if (this.healthChecker) {
|
||||||
this.healthChecker.incError();
|
this.healthChecker.incError();
|
||||||
}
|
}
|
||||||
|
@ -1861,7 +1875,7 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async loadPage(page: Page, data: PageState) {
|
async loadPage(page: Page, data: PageState, seed: ScopedSeed) {
|
||||||
const { url, depth } = data;
|
const { url, depth } = data;
|
||||||
|
|
||||||
const logDetails = data.logDetails;
|
const logDetails = data.logDetails;
|
||||||
|
@ -1889,8 +1903,8 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
|
|
||||||
// store the first successful non-redirect response, even if page doesn't load fully
|
// store the first successful non-redirect response, even if page doesn't load fully
|
||||||
const waitFirstResponse = (resp: HTTPResponse) => {
|
const waitFirstResponse = (resp: HTTPResponse) => {
|
||||||
firstResponse = resp;
|
if (!isRedirectStatus(resp.status())) {
|
||||||
if (!isRedirectStatus(firstResponse.status())) {
|
firstResponse = resp;
|
||||||
// don't listen to any additional responses
|
// don't listen to any additional responses
|
||||||
page.off("response", waitFirstResponse);
|
page.off("response", waitFirstResponse);
|
||||||
}
|
}
|
||||||
|
@ -1949,11 +1963,19 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
} else if (!downloadResponse) {
|
} else if (!downloadResponse) {
|
||||||
// log if not already logged and rethrow, consider page failed
|
// log if not already logged and rethrow, consider page failed
|
||||||
if (msg !== "logged") {
|
if (msg !== "logged") {
|
||||||
logger.error("Page Load Failed, will retry", {
|
const loadState = data.loadState;
|
||||||
msg,
|
|
||||||
loadState: data.loadState,
|
// excluded in recorder
|
||||||
...logDetails,
|
if (msg.startsWith("net::ERR_BLOCKED_BY_RESPONSE")) {
|
||||||
});
|
data.pageSkipped = true;
|
||||||
|
logger.warn("Page Load Blocked, skipping", { msg, loadState });
|
||||||
|
} else {
|
||||||
|
logger.error("Page Load Failed, will retry", {
|
||||||
|
msg,
|
||||||
|
loadState,
|
||||||
|
...logDetails,
|
||||||
|
});
|
||||||
|
}
|
||||||
e.message = "logged";
|
e.message = "logged";
|
||||||
}
|
}
|
||||||
throw e;
|
throw e;
|
||||||
|
@ -2064,12 +2086,6 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
|
|
||||||
const { seedId, extraHops } = data;
|
const { seedId, extraHops } = data;
|
||||||
|
|
||||||
const seed = await this.crawlState.getSeedAt(
|
|
||||||
this.seeds,
|
|
||||||
this.numOriginalSeeds,
|
|
||||||
seedId,
|
|
||||||
);
|
|
||||||
|
|
||||||
if (!seed) {
|
if (!seed) {
|
||||||
logger.error(
|
logger.error(
|
||||||
"Seed not found, likely invalid crawl state - skipping link extraction and behaviors",
|
"Seed not found, likely invalid crawl state - skipping link extraction and behaviors",
|
||||||
|
|
|
@ -24,6 +24,7 @@ import { RedisCrawlState, WorkerId } from "./state.js";
|
||||||
import { CDPSession, Protocol } from "puppeteer-core";
|
import { CDPSession, Protocol } from "puppeteer-core";
|
||||||
import { Crawler } from "../crawler.js";
|
import { Crawler } from "../crawler.js";
|
||||||
import { getProxyDispatcher } from "./proxy.js";
|
import { getProxyDispatcher } from "./proxy.js";
|
||||||
|
import { ScopedSeed } from "./seeds.js";
|
||||||
|
|
||||||
const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
|
const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
|
||||||
const MAX_TEXT_REWRITE_SIZE = 25_000_000;
|
const MAX_TEXT_REWRITE_SIZE = 25_000_000;
|
||||||
|
@ -148,6 +149,8 @@ export class Recorder {
|
||||||
pageUrl!: string;
|
pageUrl!: string;
|
||||||
pageid!: string;
|
pageid!: string;
|
||||||
|
|
||||||
|
pageSeed?: ScopedSeed;
|
||||||
|
|
||||||
frameIdToExecId: Map<string, number> | null;
|
frameIdToExecId: Map<string, number> | null;
|
||||||
|
|
||||||
constructor({
|
constructor({
|
||||||
|
@ -691,11 +694,27 @@ export class Recorder {
|
||||||
|
|
||||||
reqresp.fetchContinued = true;
|
reqresp.fetchContinued = true;
|
||||||
|
|
||||||
|
reqresp.fillFetchRequestPaused(params);
|
||||||
|
|
||||||
if (
|
if (
|
||||||
url === this.pageUrl &&
|
url === this.pageUrl &&
|
||||||
(!this.pageInfo.ts ||
|
(!this.pageInfo.ts ||
|
||||||
(responseStatusCode && responseStatusCode < this.pageInfo.tsStatus))
|
(responseStatusCode && responseStatusCode <= this.pageInfo.tsStatus))
|
||||||
) {
|
) {
|
||||||
|
const errorReason = await this.blockPageResponse(
|
||||||
|
url,
|
||||||
|
reqresp,
|
||||||
|
responseHeaders,
|
||||||
|
);
|
||||||
|
|
||||||
|
if (errorReason) {
|
||||||
|
await cdp.send("Fetch.failRequest", {
|
||||||
|
requestId,
|
||||||
|
errorReason,
|
||||||
|
});
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
logger.debug("Setting page timestamp", {
|
logger.debug("Setting page timestamp", {
|
||||||
ts: reqresp.ts,
|
ts: reqresp.ts,
|
||||||
url,
|
url,
|
||||||
|
@ -706,8 +725,6 @@ export class Recorder {
|
||||||
this.mainFrameId = params.frameId;
|
this.mainFrameId = params.frameId;
|
||||||
}
|
}
|
||||||
|
|
||||||
reqresp.fillFetchRequestPaused(params);
|
|
||||||
|
|
||||||
if (this.noResponseForStatus(responseStatusCode)) {
|
if (this.noResponseForStatus(responseStatusCode)) {
|
||||||
reqresp.payload = new Uint8Array();
|
reqresp.payload = new Uint8Array();
|
||||||
return false;
|
return false;
|
||||||
|
@ -866,6 +883,34 @@ export class Recorder {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async blockPageResponse(
|
||||||
|
url: string,
|
||||||
|
reqresp: RequestResponseInfo,
|
||||||
|
responseHeaders?: Protocol.Fetch.HeaderEntry[],
|
||||||
|
): Promise<Protocol.Network.ErrorReason | undefined> {
|
||||||
|
if (reqresp.isRedirectStatus()) {
|
||||||
|
try {
|
||||||
|
let loc = this.getLocation(responseHeaders);
|
||||||
|
if (loc) {
|
||||||
|
loc = new URL(loc, url).href;
|
||||||
|
|
||||||
|
if (this.pageSeed && this.pageSeed.isExcluded(loc)) {
|
||||||
|
logger.warn(
|
||||||
|
"Skipping page that redirects to excluded URL",
|
||||||
|
{ newUrl: loc, origUrl: this.pageUrl },
|
||||||
|
"recorder",
|
||||||
|
);
|
||||||
|
|
||||||
|
return "BlockedByResponse";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// ignore
|
||||||
|
logger.debug("Redirect check error", e, "recorder");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
startPage({ pageid, url }: { pageid: string; url: string }) {
|
startPage({ pageid, url }: { pageid: string; url: string }) {
|
||||||
this.pageid = pageid;
|
this.pageid = pageid;
|
||||||
this.pageUrl = url;
|
this.pageUrl = url;
|
||||||
|
@ -1187,6 +1232,21 @@ export class Recorder {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected getLocation(
|
||||||
|
headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
|
||||||
|
) {
|
||||||
|
if (!headers) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
for (const header of headers) {
|
||||||
|
if (header.name.toLowerCase() === "location") {
|
||||||
|
return header.value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
protected _getContentLen(headers?: Protocol.Fetch.HeaderEntry[]) {
|
protected _getContentLen(headers?: Protocol.Fetch.HeaderEntry[]) {
|
||||||
if (!headers) {
|
if (!headers) {
|
||||||
return -1;
|
return -1;
|
||||||
|
|
|
@ -280,15 +280,23 @@ export class ScopedSeed {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (this.isExcluded(url)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return { url, isOOS };
|
||||||
|
}
|
||||||
|
|
||||||
|
isExcluded(url: string) {
|
||||||
// check exclusions
|
// check exclusions
|
||||||
for (const e of this.exclude) {
|
for (const e of this.exclude) {
|
||||||
if (e.test(url)) {
|
if (e.test(url)) {
|
||||||
//console.log(`Skipping ${url} excluded by ${e}`);
|
//console.log(`Skipping ${url} excluded by ${e}`);
|
||||||
return false;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return { url, isOOS };
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -74,6 +74,7 @@ export class PageState {
|
||||||
favicon?: string;
|
favicon?: string;
|
||||||
|
|
||||||
skipBehaviors = false;
|
skipBehaviors = false;
|
||||||
|
pageSkipped = false;
|
||||||
filteredFrames: Frame[] = [];
|
filteredFrames: Frame[] = [];
|
||||||
loadState: LoadState = LoadState.FAILED;
|
loadState: LoadState = LoadState.FAILED;
|
||||||
|
|
||||||
|
|
|
@ -2,11 +2,7 @@ import os from "os";
|
||||||
|
|
||||||
import { logger, formatErr } from "./logger.js";
|
import { logger, formatErr } from "./logger.js";
|
||||||
import { sleep, timedRun } from "./timing.js";
|
import { sleep, timedRun } from "./timing.js";
|
||||||
import {
|
import { Recorder } from "./recorder.js";
|
||||||
DirectFetchRequest,
|
|
||||||
DirectFetchResponse,
|
|
||||||
Recorder,
|
|
||||||
} from "./recorder.js";
|
|
||||||
import { rxEscape } from "./seeds.js";
|
import { rxEscape } from "./seeds.js";
|
||||||
import { CDPSession, Page } from "puppeteer-core";
|
import { CDPSession, Page } from "puppeteer-core";
|
||||||
import { PageState, WorkerId } from "./state.js";
|
import { PageState, WorkerId } from "./state.js";
|
||||||
|
@ -24,9 +20,6 @@ export type WorkerState = {
|
||||||
workerid: WorkerId;
|
workerid: WorkerId;
|
||||||
// eslint-disable-next-line @typescript-eslint/ban-types
|
// eslint-disable-next-line @typescript-eslint/ban-types
|
||||||
callbacks: Record<string, Function>;
|
callbacks: Record<string, Function>;
|
||||||
directFetchCapture:
|
|
||||||
| ((request: DirectFetchRequest) => Promise<DirectFetchResponse>)
|
|
||||||
| null;
|
|
||||||
recorder: Recorder | null;
|
recorder: Recorder | null;
|
||||||
markPageUsed: () => void;
|
markPageUsed: () => void;
|
||||||
frameIdToExecId: Map<string, number>;
|
frameIdToExecId: Map<string, number>;
|
||||||
|
@ -175,16 +168,13 @@ export class PageWorker {
|
||||||
this.page = page;
|
this.page = page;
|
||||||
this.cdp = cdp;
|
this.cdp = cdp;
|
||||||
this.callbacks = {};
|
this.callbacks = {};
|
||||||
const directFetchCapture = this.recorder
|
|
||||||
? (req: DirectFetchRequest) => this.recorder!.directFetchCapture(req)
|
|
||||||
: null;
|
|
||||||
this.opts = {
|
this.opts = {
|
||||||
page,
|
page,
|
||||||
cdp,
|
cdp,
|
||||||
workerid,
|
workerid,
|
||||||
callbacks: this.callbacks,
|
callbacks: this.callbacks,
|
||||||
recorder: this.recorder,
|
recorder: this.recorder,
|
||||||
directFetchCapture,
|
|
||||||
frameIdToExecId: new Map<string, number>(),
|
frameIdToExecId: new Map<string, number>(),
|
||||||
markPageUsed: () => {
|
markPageUsed: () => {
|
||||||
if (!this.alwaysReuse) {
|
if (!this.alwaysReuse) {
|
||||||
|
|
21
tests/exclude-redirected.test.js
Normal file
21
tests/exclude-redirected.test.js
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
import fs from "fs";
|
||||||
|
import { execSync } from "child_process";
|
||||||
|
|
||||||
|
// example.com includes a link to 'https://www.iana.org/domains/example' which redirects to 'https://www.iana.org/help/example-domains'
|
||||||
|
// page loading should be blocked on the redirect due to exclusion of 'help', though the initial link is loaded
|
||||||
|
|
||||||
|
test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
|
||||||
|
execSync(
|
||||||
|
"docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --exclude help --collection redir-exclude-test --extraHops 1");
|
||||||
|
|
||||||
|
// no entries besides header
|
||||||
|
expect(
|
||||||
|
fs
|
||||||
|
.readFileSync(
|
||||||
|
"test-crawls/collections/redir-exclude-test/pages/extraPages.jsonl",
|
||||||
|
"utf8",
|
||||||
|
).trim().split("\n").length
|
||||||
|
).toBe(1);
|
||||||
|
|
||||||
|
});
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue