Mirror of https://github.com/webrecorder/browsertrix-crawler.git
Synced 2025-10-19 06:23:16 +00:00
Retry support and additional fixes (#743)
- retries: for failed pages, allow up to 5 retries for cases where multiple retries may be needed.
- redirect: if a page URL redirects from /path/ -> /path, don't add it as an extra seed.
- proxy: don't use the global dispatcher; pass the dispatcher explicitly when using a proxy, as the proxy may interfere with local network requests.
- final exit flag: if the crawl is done and also interrupted, ensure the WACZ is still written/uploaded by setting the final exit flag to true.
- hashtag-only change forces reload: if loading a page with the same URL but a different hashtag, e.g. `https://example.com/#B` after `https://example.com/#A`, do a full reload.
parent 5d9c62e264
commit f7cbf9645b
12 changed files with 212 additions and 74 deletions
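The hashtag-only reload fix is the subtlest of these changes. As orientation before the diff, a minimal sketch of the detection, using the names introduced in the crawler changes below (urlNoHash, fullRefresh) and assuming Puppeteer's page API:

    // A navigation from https://example.com/#A to https://example.com/#B has the
    // same URL once the fragment is stripped, so page.goto() alone won't refetch.
    const urlNoHash = url.split("#")[0];
    const fullRefresh = urlNoHash === page.url().split("#")[0];

    fullLoadedResponse = await page.goto(url, gotoOpts);

    if (fullRefresh) {
      // hashtag-only change: force a real network load
      fullLoadedResponse = await page.reload(gotoOpts);
    }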
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "1.5.0-beta.2",
+  "version": "1.5.0-beta.3",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
@@ -46,6 +46,7 @@ import {
   ExtractSelector,
   PAGE_OP_TIMEOUT_SECS,
   SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
+  MAX_RETRY_FAILED,
 } from "./util/constants.js";
 
 import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
@@ -1152,13 +1153,13 @@ self.__bx_behaviors.selectMainBehavior();
   }
 
   async pageFinished(data: PageState) {
-    await this.writePage(data);
-
     // if page loaded, considered page finished successfully
     // (even if behaviors timed out)
-    const { loadState, logDetails, depth, url } = data;
+    const { loadState, logDetails, depth, url, retry } = data;
 
     if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
+      await this.writePage(data);
+
       logger.info("Page Finished", { loadState, ...logDetails }, "pageStatus");
 
       await this.crawlState.markFinished(url);
@@ -1171,6 +1172,9 @@ self.__bx_behaviors.selectMainBehavior();
 
       await this.checkLimits();
     } else {
+      if (retry >= MAX_RETRY_FAILED) {
+        await this.writePage(data);
+      }
       await this.crawlState.markFailed(url);
 
       if (this.healthChecker) {
@@ -1370,7 +1374,7 @@ self.__bx_behaviors.selectMainBehavior();
     }
 
     if (this.params.failOnFailedLimit) {
-      const numFailed = await this.crawlState.numFailed();
+      const numFailed = await this.crawlState.numFailedWillRetry();
       const failedLimit = this.params.failOnFailedLimit;
       if (numFailed >= failedLimit) {
         logger.fatal(
@@ -1498,6 +1502,7 @@ self.__bx_behaviors.selectMainBehavior();
       logger.info("crawl already finished, running post-crawl tasks", {
         state: initState,
       });
+      this.finalExit = true;
       await this.postCrawl();
       return;
     } else if (await this.crawlState.isCrawlStopped()) {
@@ -1581,8 +1586,11 @@ self.__bx_behaviors.selectMainBehavior();
 
     await this.writeStats();
 
-    // if crawl has been stopped, mark as final exit for post-crawl tasks
-    if (await this.crawlState.isCrawlStopped()) {
+    // if crawl has been stopped or finished, mark as final exit for post-crawl tasks
+    if (
+      (await this.crawlState.isCrawlStopped()) ||
+      (await this.crawlState.isFinished())
+    ) {
       this.finalExit = true;
     }
 
@@ -1822,16 +1830,19 @@ self.__bx_behaviors.selectMainBehavior();
 
     const realSize = await this.crawlState.queueSize();
     const pendingPages = await this.crawlState.getPendingList();
-    const done = await this.crawlState.numDone();
-    const failed = await this.crawlState.numFailed();
-    const total = realSize + pendingPages.length + done;
+    const pending = pendingPages.length;
+    const crawled = await this.crawlState.numDone();
+    const failedWillRetry = await this.crawlState.numFailedWillRetry();
+    const failed = await this.crawlState.numFailedNoRetry();
+    const total = realSize + pendingPages.length + crawled;
     const limit = { max: this.pageLimit || 0, hit: this.limitHit };
     const stats = {
-      crawled: done,
-      total: total,
-      pending: pendingPages.length,
-      failed: failed,
-      limit: limit,
+      crawled,
+      total,
+      pending,
+      failedWillRetry,
+      failed,
+      limit,
       pendingPages,
     };
 
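The stats object written here now separates failures that will be retried from permanently failed pages. A hypothetical snapshot of the resulting stats file (all values illustrative):

    const stats = {
      crawled: 2,                      // pages finished successfully
      total: 4,                        // queue size + pending + crawled
      pending: 1,
      failedWillRetry: 1,              // failed, still eligible for retry
      failed: 0,                       // failed, retries exhausted
      limit: { max: 0, hit: false },
      pendingPages: [],
    };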
@@ -1885,12 +1896,14 @@ self.__bx_behaviors.selectMainBehavior();
       }
     };
 
-    page.on("response", waitFirstResponse);
+    const handleFirstLoadEvents = () => {
+      page.on("response", waitFirstResponse);
 
-    // store that domcontentloaded was finished
-    page.once("domcontentloaded", () => {
-      data.loadState = LoadState.CONTENT_LOADED;
-    });
+      // store that domcontentloaded was finished
+      page.once("domcontentloaded", () => {
+        data.loadState = LoadState.CONTENT_LOADED;
+      });
+    };
 
     const gotoOpts = data.isHTMLPage
       ? this.gotoOpts
@@ -1898,9 +1911,24 @@ self.__bx_behaviors.selectMainBehavior();
 
     logger.info("Awaiting page load", logDetails);
 
+    const urlNoHash = url.split("#")[0];
+
+    const fullRefresh = urlNoHash === page.url().split("#")[0];
+
     try {
+      if (!fullRefresh) {
+        handleFirstLoadEvents();
+      }
       // store the page load response when page fully loads
       fullLoadedResponse = await page.goto(url, gotoOpts);
+
+      if (fullRefresh) {
+        logger.debug("Hashtag-only change, doing full page reload");
+
+        handleFirstLoadEvents();
+
+        fullLoadedResponse = await page.reload(gotoOpts);
+      }
     } catch (e) {
       if (!(e instanceof Error)) {
         throw e;
@@ -1921,7 +1949,7 @@ self.__bx_behaviors.selectMainBehavior();
     } else if (!downloadResponse) {
       // log if not already log and rethrow, consider page failed
       if (msg !== "logged") {
-        logger.error("Page Load Failed, skipping page", {
+        logger.error("Page Load Failed, will retry", {
           msg,
           loadState: data.loadState,
           ...logDetails,
@@ -1944,7 +1972,8 @@ self.__bx_behaviors.selectMainBehavior();
       if (
         depth === 0 &&
         !isChromeError &&
-        respUrl !== url.split("#")[0] &&
+        respUrl !== urlNoHash &&
+        respUrl + "/" !== url &&
         !downloadResponse
       ) {
         data.seedId = await this.crawlState.addExtraSeed(
@@ -2652,8 +2681,9 @@ self.__bx_behaviors.selectMainBehavior();
     if (this.origConfig) {
       this.origConfig.state = state;
     }
-    const res = yaml.dump(this.origConfig, { lineWidth: -1 });
+
     try {
+      const res = yaml.dump(this.origConfig, { lineWidth: -1 });
       logger.info(`Saving crawl state to: ${filename}`);
       await fsp.writeFile(filename, res);
     } catch (e) {
@@ -5,6 +5,7 @@ import { HTTPRequest, Page } from "puppeteer-core";
 import { Browser } from "./browser.js";
 
 import { fetch } from "undici";
+import { getProxyDispatcher } from "./proxy.js";
 
 const RULE_TYPES = ["block", "allowOnly"];
 
@@ -271,7 +272,7 @@ export class BlockRules {
     logDetails: Record<string, any>,
   ) {
     try {
-      const res = await fetch(reqUrl);
+      const res = await fetch(reqUrl, { dispatcher: getProxyDispatcher() });
       const text = await res.text();
 
       return !!text.match(frameTextMatch);
@@ -302,6 +303,7 @@ export class BlockRules {
       method: "PUT",
       headers: { "Content-Type": "text/html" },
       body,
+      dispatcher: getProxyDispatcher(),
     });
   }
 }
@@ -27,6 +27,7 @@ export const ADD_LINK_FUNC = "__bx_addLink";
 export const FETCH_FUNC = "__bx_fetch";
 
 export const MAX_DEPTH = 1000000;
+export const MAX_RETRY_FAILED = 5;
 
 export const FETCH_HEADERS_TIMEOUT_SECS = 30;
 export const PAGE_OP_TIMEOUT_SECS = 5;
@@ -6,6 +6,7 @@ import util from "util";
 import { exec as execCallback } from "child_process";
 
 import { logger } from "./logger.js";
+import { getProxyDispatcher } from "./proxy.js";
 
 const exec = util.promisify(execCallback);
 
@@ -85,7 +86,7 @@ async function collectOnlineBehavior(url: string): Promise<FileSources> {
   const behaviorFilepath = `/app/behaviors/${filename}`;
 
   try {
-    const res = await fetch(url);
+    const res = await fetch(url, { dispatcher: getProxyDispatcher() });
     const fileContents = await res.text();
     await fsp.writeFile(behaviorFilepath, fileContents);
     logger.info(
@@ -3,6 +3,7 @@ import { formatErr, logger } from "./logger.js";
 import { Browser } from "./browser.js";
 
 import { fetch } from "undici";
+import { getProxyDispatcher } from "./proxy.js";
 
 export class OriginOverride {
   originOverride: { origUrl: URL; destUrl: URL }[];
@@ -45,7 +46,10 @@ export class OriginOverride {
         headers.set("origin", orig.origin);
       }
 
-      const resp = await fetch(newUrl, { headers });
+      const resp = await fetch(newUrl, {
+        headers,
+        dispatcher: getProxyDispatcher(),
+      });
 
       const body = Buffer.from(await resp.arrayBuffer());
       const respHeaders = Object.fromEntries(resp.headers);
@@ -1,5 +1,5 @@
 import net from "net";
-import { Agent, Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici";
+import { Agent, Dispatcher, ProxyAgent } from "undici";
 
 import child_process from "child_process";
 
@@ -13,6 +13,8 @@ const SSH_PROXY_LOCAL_PORT = 9722;
 
 const SSH_WAIT_TIMEOUT = 30000;
 
+let proxyDispatcher: Dispatcher | undefined = undefined;
+
 export function getEnvProxyUrl() {
   if (process.env.PROXY_SERVER) {
     return process.env.PROXY_SERVER;
@@ -46,10 +48,14 @@ export async function initProxy(
 
   // set global fetch() dispatcher (with proxy, if any)
   const dispatcher = createDispatcher(proxy, agentOpts);
-  setGlobalDispatcher(dispatcher);
+  proxyDispatcher = dispatcher;
   return proxy;
 }
 
+export function getProxyDispatcher() {
+  return proxyDispatcher;
+}
+
 export function createDispatcher(
   proxyUrl: string,
   opts: Agent.Options,
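This getProxyDispatcher() accessor is the pattern the rest of the diff applies at each undici fetch() call site: rather than installing a process-wide dispatcher via setGlobalDispatcher(), the proxy dispatcher is passed per request, so requests that should not go through the proxy (e.g. local network requests) are unaffected. A minimal usage sketch (URL illustrative):

    import { fetch } from "undici";
    import { getProxyDispatcher } from "./proxy.js";

    // Routed through the crawl proxy when initProxy() configured one;
    // when getProxyDispatcher() returns undefined, undici falls back to
    // its default dispatcher.
    const res = await fetch("https://example.com/robots.txt", {
      dispatcher: getProxyDispatcher(),
    });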
@@ -8,7 +8,7 @@ import {
   isRedirectStatus,
 } from "./reqresp.js";
 
-import { fetch, getGlobalDispatcher, Response } from "undici";
+import { fetch, Response } from "undici";
 
 import {
   getCustomRewriter,
@@ -23,6 +23,7 @@ import { WARCWriter } from "./warcwriter.js";
 import { RedisCrawlState, WorkerId } from "./state.js";
 import { CDPSession, Protocol } from "puppeteer-core";
 import { Crawler } from "../crawler.js";
+import { getProxyDispatcher } from "./proxy.js";
 
 const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
 const MAX_TEXT_REWRITE_SIZE = 25_000_000;
@@ -1588,14 +1589,18 @@ class AsyncFetcher {
 
     const headers = reqresp.getRequestHeadersDict();
 
-    const dispatcher = getGlobalDispatcher().compose((dispatch) => {
-      return (opts, handler) => {
-        if (opts.headers) {
-          reqresp.requestHeaders = opts.headers as Record<string, string>;
-        }
-        return dispatch(opts, handler);
-      };
-    });
+    let dispatcher = getProxyDispatcher();
+
+    if (dispatcher) {
+      dispatcher = dispatcher.compose((dispatch) => {
+        return (opts, handler) => {
+          if (opts.headers) {
+            reqresp.requestHeaders = opts.headers as Record<string, string>;
+          }
+          return dispatch(opts, handler);
+        };
+      });
+    }
 
     const resp = await fetch(url!, {
       method,
@@ -10,6 +10,7 @@ import { DETECT_SITEMAP } from "./constants.js";
 import { sleep } from "./timing.js";
 
 import { fetch, Response } from "undici";
+import { getProxyDispatcher } from "./proxy.js";
 
 const SITEMAP_CONCURRENCY = 5;
 
@@ -65,7 +66,10 @@ export class SitemapReader extends EventEmitter {
 
   async _fetchWithRetry(url: string, message: string) {
     while (true) {
-      const resp = await fetch(url, { headers: this.headers });
+      const resp = await fetch(url, {
+        headers: this.headers,
+        dispatcher: getProxyDispatcher(),
+      });
 
       if (resp.ok) {
         return resp;
@@ -3,7 +3,7 @@ import { v4 as uuidv4 } from "uuid";
 
 import { logger } from "./logger.js";
 
-import { MAX_DEPTH } from "./constants.js";
+import { MAX_DEPTH, MAX_RETRY_FAILED } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";
 
@@ -35,6 +35,7 @@ export type QueueEntry = {
   extraHops: number;
   ts?: number;
   pageid?: string;
+  retry?: number;
 };
 
 // ============================================================================
@@ -54,6 +55,7 @@ export class PageState {
   seedId: number;
   depth: number;
   extraHops: number;
+  retry: number;
 
   status: number;
 
@@ -87,6 +89,7 @@ export class PageState {
     }
     this.pageid = redisData.pageid || uuidv4();
     this.status = 0;
+    this.retry = redisData.retry || 0;
   }
 }
 
@@ -115,17 +118,12 @@ declare module "ioredis" {
     uid: string,
   ): Result<void, Context>;
 
-  movefailed(
-    pkey: string,
-    fkey: string,
-    url: string,
-    value: string,
-    state: string,
-  ): Result<void, Context>;
+  movefailed(pkey: string, fkey: string, url: string): Result<void, Context>;
 
   requeuefailed(
     fkey: string,
     qkey: string,
+    ffkey: string,
     maxRetryPending: number,
     maxRegularDepth: number,
   ): Result<number, Context>;
@@ -170,7 +168,7 @@ export type SaveState = {
 // ============================================================================
 export class RedisCrawlState {
   redis: Redis;
-  maxRetryPending = 1;
+  maxRetryPending = MAX_RETRY_FAILED;
 
   uid: string;
   key: string;
@@ -181,6 +179,7 @@ export class RedisCrawlState {
   skey: string;
   dkey: string;
   fkey: string;
+  ffkey: string;
   ekey: string;
   pageskey: string;
   esKey: string;
@@ -202,6 +201,8 @@ export class RedisCrawlState {
     this.dkey = this.key + ":d";
     // failed
     this.fkey = this.key + ":f";
+    // failed final, no more retry
+    this.ffkey = this.key + ":ff";
     // crawler errors
     this.ekey = this.key + ":e";
     // pages
@@ -283,7 +284,6 @@ local json = redis.call('hget', KEYS[1], ARGV[1]);
 
 if json then
   local data = cjson.decode(json);
-  data[ARGV[3]] = ARGV[2];
   json = cjson.encode(data);
 
   redis.call('lpush', KEYS[2], json);
@@ -294,23 +294,25 @@ end
 });
 
 redis.defineCommand("requeuefailed", {
-  numberOfKeys: 2,
+  numberOfKeys: 3,
   lua: `
 local json = redis.call('rpop', KEYS[1]);
 
 if json then
   local data = cjson.decode(json);
   data['retry'] = (data['retry'] or 0) + 1;
-  if tonumber(data['retry']) <= tonumber(ARGV[1]) then
-    json = cjson.encode(data);
+
+  if data['retry'] <= tonumber(ARGV[1]) then
+    local json = cjson.encode(data);
     local score = (data['depth'] or 0) + ((data['extraHops'] or 0) * ARGV[2]);
     redis.call('zadd', KEYS[2], score, json);
-    return 1;
+    return data['retry'];
   else
-    return 2;
+    redis.call('lpush', KEYS[3], json);
+    return 0;
  end
 end
-return 0;
+return -1;
 `,
 });
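The requeuefailed script's return value now encodes three cases, which nextFromQueue() (further down) branches on. A sketch of the caller-side interpretation, using the names from this diff:

    const retry = await redis.requeuefailed(fkey, qkey, ffkey, maxRetryPending, MAX_DEPTH);
    if (retry > 0) {
      // requeued: retry is the attempt count recorded for this URL
    } else if (retry === 0) {
      // retries exhausted: entry was pushed to the final-failed list (KEYS[3])
    } else {
      // -1: the failed list was empty, nothing to requeue
    }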
@@ -382,9 +384,7 @@ return inx;
   }
 
   async markFailed(url: string) {
-    await this.redis.movefailed(this.pkey, this.fkey, url, "1", "failed");
-
-    return await this.redis.incr(this.dkey);
+    await this.redis.movefailed(this.pkey, this.fkey, url);
   }
 
   async markExcluded(url: string) {
@@ -400,7 +400,10 @@ return inx;
   }
 
   async isFinished() {
-    return (await this.queueSize()) == 0 && (await this.numDone()) > 0;
+    return (
+      (await this.queueSize()) + (await this.numFailedWillRetry()) == 0 &&
+      (await this.numDone()) + (await this.numFailedNoRetry()) > 0
+    );
   }
 
   async setStatus(status_: string) {
@@ -572,25 +575,22 @@ return inx;
 
   async nextFromQueue() {
     let json = await this._getNext();
-    let retryFailed = false;
+    let retry = 0;
 
     if (!json) {
-      const res = await this.redis.requeuefailed(
+      retry = await this.redis.requeuefailed(
         this.fkey,
         this.qkey,
+        this.ffkey,
         this.maxRetryPending,
         MAX_DEPTH,
       );
 
-      switch (res) {
-        case 1:
-          json = await this._getNext();
-          retryFailed = true;
-          break;
-
-        case 2:
-          logger.debug("Did not retry failed, already retried", {}, "state");
-          return null;
+      if (retry > 0) {
+        json = await this._getNext();
+      } else if (retry === 0) {
+        logger.debug("Did not retry failed, already retried", {}, "state");
+        return null;
       }
     }
 
@@ -607,8 +607,8 @@ return inx;
       return null;
     }
 
-    if (retryFailed) {
-      logger.debug("Retring failed URL", { url: data.url }, "state");
+    if (retry) {
+      logger.debug("Retrying failed URL", { url: data.url, retry }, "state");
     }
 
     await this.markStarted(data.url);
@@ -626,11 +626,14 @@ return inx;
     const seen = await this._iterSet(this.skey);
     const queued = await this._iterSortedKey(this.qkey, seen);
     const pending = await this.getPendingList();
-    const failed = await this._iterListKeys(this.fkey, seen);
+    const failedWillRetry = await this._iterListKeys(this.fkey, seen);
+    const failedNoRetry = await this._iterListKeys(this.ffkey, seen);
     const errors = await this.getErrorList();
     const extraSeeds = await this._iterListKeys(this.esKey, seen);
     const sitemapDone = await this.isSitemapDone();
 
+    const failed = failedWillRetry.concat(failedNoRetry);
+
     const finished = [...seen.values()];
 
     return {
@@ -721,6 +724,7 @@ return inx;
     await this.redis.del(this.pkey);
     await this.redis.del(this.dkey);
     await this.redis.del(this.fkey);
+    await this.redis.del(this.ffkey);
     await this.redis.del(this.skey);
     await this.redis.del(this.ekey);
 
@@ -803,7 +807,12 @@ return inx;
 
     for (const json of state.failed) {
       const data = JSON.parse(json);
-      await this.redis.zadd(this.qkey, this._getScore(data), json);
+      const retry = data.retry || 0;
+      if (retry <= this.maxRetryPending) {
+        await this.redis.zadd(this.qkey, this._getScore(data), json);
+      } else {
+        await this.redis.rpush(this.ffkey, json);
+      }
       seen.push(data.url);
     }
 
@@ -831,10 +840,14 @@ return inx;
     return res;
   }
 
-  async numFailed() {
+  async numFailedWillRetry() {
     return await this.redis.llen(this.fkey);
   }
 
+  async numFailedNoRetry() {
+    return await this.redis.llen(this.ffkey);
+  }
+
   async getPendingList() {
     return await this.redis.hvals(this.pkey);
   }
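Taken together, failed URLs now move through two Redis lists: <crawl id>:f holds failures that will be retried and <crawl id>:ff holds failures with retries exhausted. A small inspection sketch for a crawl started with CRAWL_ID=test and --debugAccessRedis, matching the new test below (port and key names taken from that test):

    import Redis from "ioredis";

    const redis = new Redis("redis://127.0.0.1:36387/0");
    const willRetry = await redis.lrange("test:f", 0, -1);   // retryable failures
    const noRetry = await redis.lrange("test:ff", 0, -1);    // failed for good
    console.log({ willRetry: willRetry.length, noRetry: noRetry.length });
    await redis.quit();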
@@ -50,6 +50,7 @@ test("check that stats file format is correct", () => {
   expect(dataJSON.total).toEqual(3);
   expect(dataJSON.pending).toEqual(0);
   expect(dataJSON.failed).toEqual(0);
+  expect(dataJSON.failedWillRetry).toEqual(0);
   expect(dataJSON.limit.max).toEqual(3);
   expect(dataJSON.limit.hit).toBe(true);
   expect(dataJSON.pendingPages.length).toEqual(0);
tests/retry-failed.test.js (new file, 71 lines)
@@ -0,0 +1,71 @@
+import { execSync, spawn } from "child_process";
+import fs from "fs";
+import Redis from "ioredis";
+
+const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
+
+async function sleep(time) {
+  await new Promise((resolve) => setTimeout(resolve, time));
+}
+
+test("run crawl", async () => {
+  let status = 0;
+  execSync(`docker run -d -v $PWD/test-crawls:/crawls -e CRAWL_ID=test -p 36387:6379 --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --pageExtraDelay 10 --debugAccessRedis --collection retry-fail`);
+
+  /*
+  async function runServer() {
+    console.log("Waiting to start server");
+    await sleep(2000);
+
+    console.log("Starting server");
+    //spawn("../../node_modules/.bin/http-server", ["-p", "31501", "--username", "user", "--password", "pass"], {cwd: "./docs/site"});
+  }
+  */
+
+  const redis = new Redis("redis://127.0.0.1:36387/0", { lazyConnect: true, retryStrategy: () => null });
+
+  await sleep(3000);
+
+  let numRetries = 0;
+
+  try {
+    await redis.connect({
+      maxRetriesPerRequest: 100,
+    });
+
+    //runServer();
+
+    while (true) {
+      const res = await redis.lrange("test:ff", 0, -1);
+      if (res.length) {
+        const data = JSON.parse(res);
+        if (data.retry) {
+          numRetries = data.retry;
+          break;
+        }
+      }
+      await sleep(20);
+    }
+
+  } catch (e) {
+    console.error(e);
+  } finally {
+    expect(numRetries).toBe(5);
+  }
+});
+
+
+test("check only one failed page entry is made", () => {
+  expect(
+    fs.existsSync("test-crawls/collections/retry-fail/pages/pages.jsonl"),
+  ).toBe(true);
+
+  expect(
+    fs
+      .readFileSync(
+        "test-crawls/collections/retry-fail/pages/pages.jsonl",
+        "utf8",
+      ).trim().split("\n").length
+  ).toBe(3);
+});