Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00

- reduced memory usage, avoids memory leak issues caused by using playwright (see #298)
- browser: split Browser into Browser and BaseBrowser
- browser: puppeteer-specific functions added to Browser for additional flexibility in case the backend needs to change again later
- browser: use defaultArgs from playwright
- browser: attempt to recover if the initial target is gone
- logging: add debug logging from process.memoryUsage() after every page
- request interception: use priorities for cooperative request interception (see the sketch after this list)
- request interception: move to setupPage() to run once per page, enable if any of blockrules, adblockrules or originOverrides are used
- request interception: fix originOverrides enabled check, fix to work with catch-all request interception
- default args: set --waitUntil back to 'load,networkidle2'
- update README with changes for puppeteer
- tests: fix extra hops depth test to ensure more than one page is crawled
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
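The priorities change refers to Puppeteer's cooperative request interception mode: when every handler resolves a request with an explicit priority, several interceptors (block rules, ad-block rules, origin overrides) can share one page, with the highest-priority resolution winning and ties broken as abort > respond > continue. The following is a minimal illustrative sketch of that mode, not the crawler's actual code; the setupInterception name, the two-handler split, and the tracker.example.com domain are assumptions made for the example.

import puppeteer from "puppeteer";

const BLOCK_PRIORITY = 1;
const DEFAULT_PRIORITY = 0;

async function setupInterception(page) {
  await page.setRequestInterception(true);

  // Hypothetical block-rules handler: abort requests to one tracker domain.
  // Passing a priority keeps interception in cooperative mode; omitting it in
  // any handler would resolve the request immediately (legacy mode).
  page.on("request", (request) => {
    if (new URL(request.url()).hostname === "tracker.example.com") {
      request.abort("blockedbyclient", BLOCK_PRIORITY);
    } else {
      request.continue({}, DEFAULT_PRIORITY);
    }
  });

  // Catch-all handler: continue everything at default priority so requests
  // are still resolved when no other handler wants to block them.
  page.on("request", (request) => {
    request.continue({}, DEFAULT_PRIORITY);
  });
}

const browser = await puppeteer.launch();
const page = await browser.newPage();
await setupInterception(page);
await page.goto("https://example.com/", { waitUntil: "networkidle2" });
await browser.close();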
12 lines · 365 B · JavaScript
// Content (MIME) types treated as HTML documents by the crawler.
export const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];

// Accepted values for Puppeteer's waitUntil page-load condition.
export const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];

// Name of the in-page function behaviors call to send log messages back to the crawler.
export const BEHAVIOR_LOG_FUNC = "__bx_log";

// Upper bound on crawl depth.
export const MAX_DEPTH = 1000000;

// Default link-extraction rule: collect the href of every anchor element.
export const DEFAULT_SELECTORS = [{
  selector: "a[href]",
  extract: "href",
  isAttribute: false
}];
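As an illustration of how a DEFAULT_SELECTORS entry might be consumed, here is a hedged sketch that evaluates the selector in the page context and reads either the DOM property or the raw attribute named by extract. The extractLinks helper and the ./constants.js import path are assumptions for the example, not the crawler's actual extraction code.

// Hypothetical helper, not the crawler's own link extraction.
import { DEFAULT_SELECTORS } from "./constants.js";

async function extractLinks(page) {
  const urls = [];
  for (const { selector, extract, isAttribute } of DEFAULT_SELECTORS) {
    const found = await page.evaluate(
      (sel, prop, useAttr) =>
        [...document.querySelectorAll(sel)].map((el) =>
          // With isAttribute false, el["href"] yields the resolved absolute URL;
          // getAttribute would return the raw value as written in the HTML.
          useAttr ? el.getAttribute(prop) : el[prop]
        ),
      selector,
      extract,
      isAttribute
    );
    urls.push(...found.filter((u) => typeof u === "string" && u.length));
  }
  return urls;
}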
|