mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00

- retries: for failed pages, set retries to 5, in case multiple retries are needed.
- redirect: if the page URL redirects from /path/ to /path, don't add it as an extra seed.
- proxy: don't use the global dispatcher; pass the dispatcher explicitly when using a proxy, as the proxy may interfere with local network requests.
- final exit flag: if the crawl is done and also interrupted, ensure the WACZ is still written/uploaded by setting final exit to true.
- hashtag-only change forces reload: when loading a page with the same URL but a different hashtag, e.g. `https://example.com/#B` after `https://example.com/#A`, do a full reload.
58 lines
1.2 KiB
TypeScript
58 lines
1.2 KiB
TypeScript
/**
 * MIME type strings that are classed as HTML content.
 *
 * Covers plain HTML plus both spellings of the XHTML type listed here.
 */
export const HTML_TYPES = [
  "text/html",
  "application/xhtml",
  "application/xhtml+xml",
];
|
|
/**
 * Accepted "wait until" option names.
 *
 * The four names match Puppeteer's page-load lifecycle events
 * (`load`, `domcontentloaded`, `networkidle0`, `networkidle2`).
 */
export const WAIT_UNTIL_OPTS = [
  "load",
  "domcontentloaded",
  "networkidle0",
  "networkidle2",
];
|
|
|
|
/**
 * Accepted service-worker option names.
 *
 * Declared `as const` so the literal names are preserved in the type and
 * {@link ServiceWorkerOpt} can be derived from this list instead of being
 * maintained by hand.
 */
export const SERVICE_WORKER_OPTS = [
  "disabled",
  "disabled-if-profile",
  "enabled",
] as const;

/** Union of the string literals listed in {@link SERVICE_WORKER_OPTS}. */
export type ServiceWorkerOpt = (typeof SERVICE_WORKER_OPTS)[number];
|
|
|
|
/**
 * Sentinel string related to sitemap handling.
 *
 * NOTE(review): presumably stands in for an explicit sitemap URL to request
 * automatic sitemap detection — confirm against the code that compares
 * against this value.
 */
export const DETECT_SITEMAP = "<detect>";
|
|
|
|
/**
 * Recognized text-extraction type names.
 *
 * NOTE(review): the names suggest where extracted text is written (pages
 * output vs. WARC records) — confirm against the consumer of this list.
 */
export const EXTRACT_TEXT_TYPES = [
  "to-pages",
  "to-warc",
  "final-to-warc",
];
|
|
|
|
// Function-name constants sharing the `__bx_` prefix.
// NOTE(review): presumably the names under which callbacks are exposed to
// in-page scripts — confirm against where these names are registered.

/** Name of the behavior logging function. */
export const BEHAVIOR_LOG_FUNC = "__bx_log";
/** Name of the add-link function. */
export const ADD_LINK_FUNC = "__bx_addLink";
/** Name of the fetch function. */
export const FETCH_FUNC = "__bx_fetch";
|
|
|
|
/**
 * Upper bound used for depth values.
 *
 * NOTE(review): one million presumably serves as an "effectively unlimited"
 * depth — confirm against callers.
 */
export const MAX_DEPTH = 1000000;

/** Maximum number of retries for a failed page. */
export const MAX_RETRY_FAILED = 5;
|
|
|
|
// Timeout constants; the `_SECS` suffix marks each value as seconds.

/** Timeout, in seconds, for fetching headers. */
export const FETCH_HEADERS_TIMEOUT_SECS = 30;
/** Timeout, in seconds, for a page operation. */
export const PAGE_OP_TIMEOUT_SECS = 5;
/** Timeout, in seconds, for the initial sitemap fetch. */
export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
|
|
|
|
/**
 * Describes one link-extraction rule: which elements to match and which
 * value to read from each match.
 */
export type ExtractSelector = {
  /** CSS selector for the elements to match. */
  selector: string;
  /** Name of the value to read from each matched element. */
  extract: string;
  /**
   * NOTE(review): presumably `true` reads `extract` as an HTML attribute and
   * `false` reads it as a DOM property — confirm against the extraction code.
   */
  isAttribute: boolean;
};

/**
 * Default extraction rules: a single rule matching anchors that carry an
 * `href`, reading `href` with `isAttribute` set to `false`.
 */
export const DEFAULT_SELECTORS: ExtractSelector[] = [
  {
    selector: "a[href]",
    extract: "href",
    isAttribute: false,
  },
];
|
|
|
|
/**
 * Recognized behavior type names.
 *
 * NOTE(review): presumably the set of in-page behaviors that can be switched
 * on individually — confirm against the option parsing that consumes it.
 */
export const BEHAVIOR_TYPES = [
  "autoplay",
  "autofetch",
  "autoscroll",
  "autoclick",
  "siteSpecific",
];
|
|
|
|
/**
 * Display identifier string.
 *
 * NOTE(review): `":99"` has the form of an X11 display name; presumably the
 * virtual display the browser is launched on — confirm against usage.
 */
export const DISPLAY = ":99";
|