Mirror of https://github.com/webrecorder/browsertrix-crawler.git
Add option for sleep interval after behaviors run + timing cleanup (#257)
* Add --pageExtraDelay option to add extra delay/wait time after every page (fixes #131)
* Store total page time in 'maxPageTime', include pageExtraDelay
* Rename timeout -> pageLoadTimeout
* Cleanup:
  - store seconds for most interval checks, convert to ms only for API calls, remove most sec <-> ms conversions
  - add secondsElapsed() utility function to help with checking elapsed time
  - clean up comments

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
commit b0e93cb06e (parent 02fb137b2c)
6 changed files with 72 additions and 48 deletions
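The core of the timing cleanup is that every per-page budget is now kept in seconds and summed once into `maxPageTime`. A minimal sketch of that budget, assuming illustrative values for the `FETCH_TIMEOUT_SECS` and `PAGE_OP_TIMEOUT_SECS` constants (they are referenced but not defined in the diff below, so these numbers are assumptions for illustration only):

```js
// Sketch only: the constant values below are assumed, not taken from the crawler source.
const FETCH_TIMEOUT_SECS = 30;   // assumed value
const PAGE_OP_TIMEOUT_SECS = 5;  // assumed value

const params = { pageLoadTimeout: 90, behaviorTimeout: 90, pageExtraDelay: 0 };

// total per-page budget, in seconds; if exceeded, the worker interrupts and moves on
const maxPageTime = params.pageLoadTimeout + params.behaviorTimeout +
  FETCH_TIMEOUT_SECS * 2 + PAGE_OP_TIMEOUT_SECS * 2 + params.pageExtraDelay;

console.log(maxPageTime); // 90 + 90 + 60 + 10 + 0 = 250
```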
README.md (44 changes)

@@ -68,13 +68,12 @@ Browsertrix Crawler includes a number of additional command-line options, explai…
   --crawlId, --id               A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var)
-                                [string] [default: "06bf9a4df9f7"]
+                                [string] [default: "ce75810e6874"]
   --newContext                  Deprecated as of 0.8.0, any values passed will be ignored [string] [default: null]
-  --waitUntil                   Puppeteer page.goto() condition to wait for before continuing, can be multiple separate by ','
+  --waitUntil                   Playwright page.goto() condition to wait for before continuing
                                 [default: "load"]
   --depth                       The depth of the crawl for all seeds [number] [default: -1]

@@ -83,11 +82,11 @@
                                 [number] [default: 0]
   --limit                       Limit crawl to this number of pages [number] [default: 0]
-  --timeout                     Timeout for each page to load (in seconds) [number] [default: 90]
+  --pageLoadTimeout, --timeout  Timeout for each page to load (in seconds) [number] [default: 90]
   --scopeType                   A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes
                                 [string] [choices: "page", "page-spa", "prefix", "host", "domain", "any", "custom"]
   --scopeIncludeRx, --include   Regex of page URLs that should be in…

@@ -131,19 +130,20 @@
   --generateWACZ, --generatewacz, --generateWacz
                                 If set, generate wacz [boolean] [default: false]
-  --logging                     Logging options for crawler, can include: stats, pywb, behaviors, behaviors-debug, jserrors
+  --logging                     Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug
                                 [string] [default: "stats"]
   --text                        If set, extract text to the pages.jsonl file [boolean] [default: false]
   --cwd                         Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()
                                 [string] [default: "/crawls"]
-  --mobileDevice                Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts [string]
+  --mobileDevice                Emulate mobile device by name from: https://github.com/microsoft/playwright/blob/main/packages/playwright-core/src/server/deviceDescriptorsSource.json [string]
   --userAgent                   Override user-agent with specified string [string]
   --userAgentSuffix             Append suffix to existing browser us…

@@ -162,12 +162,16 @@
                                 …age behavior will run on each page. If 0, a behavior can run until finish. [number] [default: 90]
+  --pageExtraDelay, --delay     If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page
+                                [number] [default: 0]
   --profile                     Path to tar.gz file which will be extracted and used as the browser profile [string]
   --screenshot                  Screenshot options for crawler, can include: view, thumbnail, fullPage (comma-separated list)
                                 [string] [default: ""]
   --screencastPort              If set to a non-zero value, starts an HTTP server with screencast access…

@@ -181,9 +185,10 @@
                                 …o record in combined WARCs
-  --redisStoreUrl               If set, url for remote redis server to store state. Otherwise, using in-memory store [string]
+  --redisStoreUrl               If set, url for remote redis server to store state. Otherwise, using in-memory store
+                                [string] [default: "redis://localhost:6379/0"]
   --saveState                   If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted
                                 [string] [choices: "never", "partial", "always"] [default: "partial"]

@@ -212,8 +217,11 @@
   --netIdleWait                 if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope
                                 [number] [default: -1]
   --lang                        if set, sets the language used by the browser, should be ISO 639 language[-country] code [string]
   --config                      Path to YAML config file
 ```
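As a usage note (hypothetical values, not taken from the README diff above): a crawl started with `--pageLoadTimeout 60 --pageExtraDelay 5` would allow each page up to 60 seconds to load and then, after behaviors finish, sleep an extra 5 seconds before moving on to the next page; the old flag names remain available as the aliases `--timeout` and `--delay`.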
crawler.js (36 changes)

@@ -20,7 +20,7 @@ import { parseArgs } from "./util/argParser.js";
 import { initRedis } from "./util/redis.js";
 import { logger, errJSON } from "./util/logger.js";
 import { runWorkers } from "./util/worker.js";
-import { sleep, timedRun } from "./util/timing.js";
+import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
 
 import { Browser } from "./util/browser.js";

@@ -76,7 +76,11 @@ export class Crawler {
 
     this.saveStateFiles = [];
     this.lastSaveTime = 0;
-    this.saveStateInterval = this.params.saveStateInterval * 1000;
+
+    // sum of page load + behavior timeouts + 2 x fetch + cloudflare + link extraction timeouts + extra page delay
+    // if exceeded, will interrupt and move on to next page (likely behaviors or some other operation is stuck)
+    this.maxPageTime = this.params.pageLoadTimeout + this.params.behaviorTimeout +
+      FETCH_TIMEOUT_SECS*2 + PAGE_OP_TIMEOUT_SECS*2 + this.params.pageExtraDelay;
 
     this.emulateDevice = this.params.emulateDevice || {};

@@ -85,7 +89,7 @@ export class Crawler {
 
     this.gotoOpts = {
       waitUntil: this.params.waitUntil,
-      timeout: this.params.timeout
+      timeout: this.params.pageLoadTimeout * 1000
     };
 
     // pages directory

@@ -152,7 +156,9 @@ export class Crawler {
 
     logger.debug(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state");
 
-    this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.behaviorTimeout + this.params.timeout, os.hostname());
+    logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state");
+
+    this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.maxPageTime, os.hostname());
 
     if (this.params.saveState === "always" && this.params.saveStateInterval) {
       logger.debug(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`, {}, "state");

@@ -406,11 +412,9 @@ export class Crawler {
     } else if (data.skipBehaviors) {
       logger.info("Skipping behaviors for slow page", logDetails, "behavior");
     } else {
-      const behaviorTimeout = this.params.behaviorTimeout / 1000;
-
       const res = await timedRun(
         this.runBehaviors(page, data.filteredFrames, logDetails),
-        behaviorTimeout,
+        this.params.behaviorTimeout,
         "Behaviors timed out",
         logDetails,
         "behavior"

@@ -423,6 +427,11 @@ export class Crawler {
       }
     }
 
+    if (this.params.pageExtraDelay) {
+      logger.info(`Waiting ${this.params.pageExtraDelay} seconds before moving on to next page`, logDetails);
+      await sleep(this.params.pageExtraDelay);
+    }
+
     return true;
   }

@@ -557,8 +566,8 @@ export class Crawler {
     }
 
     if (this.params.timeLimit) {
-      const elapsed = (Date.now() - this.startTime) / 1000;
-      if (elapsed > this.params.timeLimit) {
+      const elapsed = secondsElapsed(this.startTime);
+      if (elapsed >= this.params.timeLimit) {
        logger.info(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
        interrupt = true;
      }

@@ -683,9 +692,10 @@ export class Crawler {
       }
     });
 
-    const totalPageTimeout = (this.params.behaviorTimeout + this.params.timeout) / 1000 + 60;
-
-    await runWorkers(this, this.params.workers, totalPageTimeout);
+    // --------------
+    // Run Crawl Here!
+    await runWorkers(this, this.params.workers, this.maxPageTime);
+    // --------------
 
     await this.serializeConfig(true);

@@ -1359,7 +1369,7 @@ export class Crawler {
 
     if (!done) {
       // if not done, save state only after specified interval has elapsed
-      if ((now.getTime() - this.lastSaveTime) < this.saveStateInterval) {
+      if (secondsElapsed(this.lastSaveTime, now) < this.params.saveStateInterval) {
         return;
       }
     }
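Note the one conversion that remains: `gotoOpts.timeout` is multiplied by 1000 because Playwright's `page.goto()` expects milliseconds, while the rest of the code now stays in seconds (the commit message's "convert to ms only for api calls"). crawler.js also passes seconds directly to `sleep()` (e.g. `await sleep(this.params.pageExtraDelay)`); the implementation of `sleep` in util/timing.js is not shown in this diff, but a minimal seconds-based helper consistent with that call site might look like:

```js
// Assumed shape of util/timing.js's sleep(); the real definition is not part of this diff.
export function sleep(seconds) {
  return new Promise((resolve) => setTimeout(resolve, seconds * 1000));
}
```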
util/argParser.js

@@ -74,7 +74,8 @@ class ArgParser
         type: "number",
       },
 
-      "timeout": {
+      "pageLoadTimeout": {
+        alias: "timeout",
         describe: "Timeout for each page to load (in seconds)",
         default: 90,
         type: "number",

@@ -223,6 +224,13 @@ class ArgParser
         type: "number",
       },
 
+      "pageExtraDelay": {
+        alias: "delay",
+        describe: "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page",
+        default: 0,
+        type: "number",
+      },
+
       "profile": {
         describe: "Path to tar.gz file which will be extracted and used as the browser profile",
         type: "string",

@@ -354,10 +362,7 @@ class ArgParser
       logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
     }
 
-    argv.timeout *= 1000;
-
-    // waitUntil condition must be: load, domcontentloaded, networkidle
-    // TODO: Playwright migration - for now, can only support one
+    // waitUntil condition must be one of WAIT_UNTIL_OPTS: load, domcontentloaded, networkidle
+    // (see: https://playwright.dev/docs/api/class-page#page-goto-option-wait-until)
     if (!WAIT_UNTIL_OPTS.includes(argv.waitUntil)) {
       logger.fatal("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));

@@ -385,9 +390,6 @@ class ArgParser
       argv.behaviors = argv.behaviors.split(",");
     }
     argv.behaviors.forEach((x) => behaviorOpts[x] = true);
-    if (argv.behaviorTimeout) {
-      behaviorOpts.timeout = argv.behaviorTimeout *= 1000;
-    }
     behaviorOpts.log = BEHAVIOR_LOG_FUNC;
     argv.behaviorOpts = JSON.stringify(behaviorOpts);
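The rename keeps the old flag working through a yargs alias. A small standalone sketch of how such an alias behaves (hypothetical script, not part of the crawler; the crawler's own parser wraps this in its ArgParser class):

```js
import yargs from "yargs";
import { hideBin } from "yargs/helpers";

const argv = yargs(hideBin(process.argv))
  .option("pageLoadTimeout", {
    alias: "timeout",
    describe: "Timeout for each page to load (in seconds)",
    default: 90,
    type: "number",
  })
  .parse();

// Both `--timeout 120` and `--pageLoadTimeout 120` populate argv.pageLoadTimeout
// (and argv.timeout), so existing configs that pass --timeout keep working.
console.log(argv.pageLoadTimeout);
```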
util/state.js

@@ -39,7 +39,7 @@ export class PageState
 // ============================================================================
 export class RedisCrawlState
 {
-  constructor(redis, key, pageTimeout, uid) {
+  constructor(redis, key, maxPageTime, uid) {
    this.redis = redis;
 
    this.maxRetryPending = 1;

@@ -48,7 +48,7 @@ export class RedisCrawlState
 
    this.uid = uid;
    this.key = key;
-    this.pageTimeout = pageTimeout / 1000;
+    this.maxPageTime = maxPageTime;
 
    this.qkey = this.key + ":q";
    this.pkey = this.key + ":p";

@@ -152,7 +152,7 @@ return 0;
  async markStarted(url) {
    const started = this._timestamp();
 
-    return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.pageTimeout);
+    return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.maxPageTime);
  }
 
  async markFinished(url) {
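The constructor now stores `maxPageTime` without the former `/ 1000` conversion because the value already arrives in seconds, and Redis expirations (`EX`) are themselves specified in seconds, so the value can be handed to the `markstarted` script as-is. A small standalone ioredis sketch of that seconds convention (hypothetical key and values; the crawler's actual markstarted Lua script is not shown in this diff):

```js
import Redis from "ioredis";

// Illustration of a seconds-based pending-page entry; an assumption about
// how maxPageTime is used, not the crawler's real markstarted script.
async function markStartedSketch(redis, pkey, url, maxPageTime) {
  const started = new Date().toISOString();
  // expire the pending entry after maxPageTime seconds if the page never finishes
  await redis.set(`${pkey}:${url}`, started, "EX", maxPageTime);
}

const redis = new Redis("redis://localhost:6379/0");
await markStartedSketch(redis, "crawl:p", "https://example.com/", 250);
```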
util/timing.js

@@ -24,4 +24,8 @@ export function timedRun(promise, seconds, message="Promise timed out", logDetai…
   });
 }
 
+export function secondsElapsed(startTime, nowDate = null) {
+  nowDate = nowDate || new Date();
+
+  return (nowDate.getTime() - startTime) / 1000;
+}
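`secondsElapsed()` compares a millisecond timestamp (e.g. from `Date.now()`) against an optional `Date`, defaulting to now, and returns seconds. A small usage sketch mirroring the time-limit check in crawler.js (hypothetical values; adjust the import path to where timing.js lives relative to the caller):

```js
import { secondsElapsed } from "./util/timing.js";

const startTime = Date.now();
const timeLimit = 3600; // seconds, hypothetical crawl time limit

// somewhere in a periodic check:
if (secondsElapsed(startTime) >= timeLimit) {
  console.log(`Time threshold reached after ${secondsElapsed(startTime)} seconds, stopping`);
}
```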
util/worker.js

@@ -6,14 +6,14 @@ const MAX_REUSE = 5;
 const NEW_WINDOW_TIMEOUT = 10;
 
 // ===========================================================================
-export function runWorkers(crawler, numWorkers, timeout) {
+export function runWorkers(crawler, numWorkers, maxPageTime) {
   logger.info(`Creating ${numWorkers} workers`, {}, "worker");
 
   const workers = [];
 
   for (let i = 0; i < numWorkers; i++) {
-    //workers.push(new PageWorker(`worker-${i+1}`, crawler, timeout));
-    workers.push(new PageWorker(i, crawler, timeout));
+    //workers.push(new PageWorker(`worker-${i+1}`, crawler, maxPageTime));
+    workers.push(new PageWorker(i, crawler, maxPageTime));
   }
 
   return Promise.allSettled(workers.map((worker) => worker.run()));

@@ -23,10 +23,10 @@ export function runWorkers(crawler, numWorkers, timeout) {
 // ===========================================================================
 export class PageWorker
 {
-  constructor(id, crawler, timeout) {
+  constructor(id, crawler, maxPageTime) {
     this.id = id;
     this.crawler = crawler;
-    this.timeout = timeout;
+    this.maxPageTime = maxPageTime;
 
     this.reuseCount = 0;
     this.page = null;

@@ -134,7 +134,7 @@ export class PageWorker
       await Promise.race([
         timedRun(
           this.crawler.crawlPage(opts),
-          this.timeout,
+          this.maxPageTime,
           "Page Worker Timeout",
           {workerid},
           "worker"
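With this change the worker's overall per-page timeout is the same `maxPageTime` budget computed once in the Crawler constructor, replacing the previous ad-hoc `(behaviorTimeout + timeout) / 1000 + 60` calculation; if a page exceeds that budget, `timedRun` logs "Page Worker Timeout" and the worker interrupts the page and moves on to the next one, as described in the constructor comment.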