Add option for sleep interval after behaviors run + timing cleanup (#257)

* Add --pageExtraDelay option to add extra delay/wait time after every page (fixes #131)

* Store total page time in 'maxPageTime', including pageExtraDelay (see the first sketch below)

* Rename timeout->pageLoadTimeout

* Cleanup:
- store seconds for most interval checks, convert to ms only for API calls, and remove most sec<->ms conversions (see the second sketch below)
- add a secondsElapsed() utility function to simplify elapsed-time checks
- clean up comments
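
A minimal sketch of the new per-page time budget, with the post-behavior delay included. FETCH_TIMEOUT_SECS and PAGE_OP_TIMEOUT_SECS are constants defined elsewhere in the crawler and are not part of this diff, so the values below are placeholders; the real computation is in the Crawler constructor hunk further down:

```js
// Placeholder values: the real FETCH_TIMEOUT_SECS / PAGE_OP_TIMEOUT_SECS constants
// are defined elsewhere in the crawler and are not shown in this diff.
const FETCH_TIMEOUT_SECS = 30;
const PAGE_OP_TIMEOUT_SECS = 5;

// Example parameters, all in seconds: the default load/behavior timeouts plus a
// hypothetical --pageExtraDelay of 30.
const params = { pageLoadTimeout: 90, behaviorTimeout: 90, pageExtraDelay: 30 };

// Same sum as in the Crawler constructor: page load + behaviors + 2x fetch +
// 2x page-op timeouts + the extra post-behavior delay.
const maxPageTime = params.pageLoadTimeout + params.behaviorTimeout +
  FETCH_TIMEOUT_SECS * 2 + PAGE_OP_TIMEOUT_SECS * 2 + params.pageExtraDelay;

console.log(maxPageTime); // 90 + 90 + 60 + 10 + 30 = 280 seconds with these values
```

On the command line the new option also has a shorter alias, so --pageExtraDelay 30 and --delay 30 are equivalent: both sleep 30 seconds after behaviors finish on each page, and those 30 seconds count toward the budget above.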

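The cleanup standardizes on seconds internally and converts to milliseconds only at the API boundary that needs them. A short sketch of that pattern, using the new secondsElapsed() helper added in util/timing.js (the params values here are illustrative):

```js
// Same helper as added in util/timing.js below: elapsed time in seconds
// from a Date.now()-style millisecond timestamp.
function secondsElapsed(startTime, nowDate = null) {
  nowDate = nowDate || new Date();
  return (nowDate.getTime() - startTime) / 1000;
}

// Illustrative params, stored in seconds throughout.
const params = { timeLimit: 3600, pageLoadTimeout: 90 };
const startTime = Date.now();

// Interval checks compare seconds directly, no sec<->ms conversion needed.
if (params.timeLimit && secondsElapsed(startTime) >= params.timeLimit) {
  console.log("time limit reached, stopping");
}

// Only APIs that expect milliseconds (e.g. page.goto's timeout) get a conversion.
const gotoOpts = { waitUntil: "load", timeout: params.pageLoadTimeout * 1000 };
console.log(gotoOpts);
```
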
---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Tessa Walsh 2023-03-22 14:50:18 -04:00 committed by GitHub
parent 02fb137b2c
commit b0e93cb06e
6 changed files with 72 additions and 48 deletions

@ -68,13 +68,12 @@ Browsertrix Crawler includes a number of additional command-line options, explai
--crawlId, --id A user provided ID for this crawl or
crawl configuration (can also be se
t via CRAWL_ID env var)
[string] [default: "06bf9a4df9f7"]
[string] [default: "ce75810e6874"]
--newContext Deprecated as of 0.8.0, any values p
assed will be ignored
[string] [default: null]
--waitUntil Puppeteer page.goto() condition to w
ait for before continuing, can be mu
ltiple separate by ','
--waitUntil Playwright page.goto() condition to
wait for before continuing
[default: "load"]
--depth The depth of the crawl for all seeds
[number] [default: -1]
@ -83,11 +82,11 @@ Browsertrix Crawler includes a number of additional command-line options, explai
[number] [default: 0]
--limit Limit crawl to this number of pages
[number] [default: 0]
--timeout Timeout for each page to load (in se
--pageLoadTimeout, --timeout Timeout for each page to load (in se
conds) [number] [default: 90]
--scopeType A predefined scope of the crawl. For
more customization, use 'custom' and
set scopeIncludeRx regexes
more customization, use 'custom' an
d set scopeIncludeRx regexes
[string] [choices: "page", "page-spa", "prefix", "host", "domain", "any", "cus
tom"]
--scopeIncludeRx, --include Regex of page URLs that should be in
@ -131,19 +130,20 @@ Browsertrix Crawler includes a number of additional command-line options, explai
--generateWACZ, --generatewacz, --ge If set, generate wacz
nerateWacz [boolean] [default: false]
--logging Logging options for crawler, can inc
lude: stats, pywb, behaviors, behavi
ors-debug, jserrors
lude: stats (enabled by default), js
errors, pywb, debug
[string] [default: "stats"]
--text If set, extract text to the pages.js
only file [boolean] [default: false]
onl file [boolean] [default: false]
--cwd Crawl working directory for captures
(pywb root). If not set, defaults t
o process.cwd()
[string] [default: "/crawls"]
--mobileDevice Emulate mobile device by name from:
https://github.com/puppeteer/puppete
er/blob/main/src/common/DeviceDescri
ptors.ts [string]
https://github.com/microsoft/playwri
ght/blob/main/packages/playwright-co
re/src/server/deviceDescriptorsSourc
e.json [string]
--userAgent Override user-agent with specified s
tring [string]
--userAgentSuffix Append suffix to existing browser us
@ -162,12 +162,16 @@ Browsertrix Crawler includes a number of additional command-line options, explai
age behavior will run on each page.
If 0, a behavior can run until finis
h. [number] [default: 90]
--pageExtraDelay, --delay If >0, amount of time to sleep (in s
econds) after behaviors before movin
g on to next page
[number] [default: 0]
--profile Path to tar.gz file which will be ex
tracted and used as the browser prof
ile [string]
--screenshot Screenshot options for crawler, can
include: view, thumbnail, fullPage
(comma-separated list)
include: view, thumbnail, fullPage (
comma-separated list)
[string] [default: ""]
--screencastPort If set to a non-zero value, starts a
n HTTP server with screencast access
@ -181,9 +185,10 @@ Browsertrix Crawler includes a number of additional command-line options, explai
o record in combined WARCs
--redisStoreUrl If set, url for remote redis server
to store state. Otherwise, using in-
memory store [string]
memory store
[string] [default: "redis://localhost:6379/0"]
--saveState If the crawl state should be seriali
zed to the crawls/ directory. Default
zed to the crawls/ directory. Defaul
ts to 'partial', only saved when cra
wl is interrupted
[string] [choices: "never", "partial", "always"] [default: "partial"]
@ -212,8 +217,11 @@ Browsertrix Crawler includes a number of additional command-line options, explai
--netIdleWait if set, wait for network idle after
page load and after behaviors are do
ne (in seconds). if -1 (default), de
determine based on scope
termine based on scope
[number] [default: -1]
--lang if set, sets the language used by th
e browser, should be ISO 639 languag
e[-country] code [string]
--config Path to YAML config file
```

@ -20,7 +20,7 @@ import { parseArgs } from "./util/argParser.js";
import { initRedis } from "./util/redis.js";
import { logger, errJSON } from "./util/logger.js";
import { runWorkers } from "./util/worker.js";
import { sleep, timedRun } from "./util/timing.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
import { Browser } from "./util/browser.js";
@ -76,7 +76,11 @@ export class Crawler {
this.saveStateFiles = [];
this.lastSaveTime = 0;
this.saveStateInterval = this.params.saveStateInterval * 1000;
// sum of page load + behavior timeouts + 2 x fetch + cloudflare + link extraction timeouts + extra page delay
// if exceeded, will interrupt and move on to next page (likely behaviors or some other operation is stuck)
this.maxPageTime = this.params.pageLoadTimeout + this.params.behaviorTimeout +
FETCH_TIMEOUT_SECS*2 + PAGE_OP_TIMEOUT_SECS*2 + this.params.pageExtraDelay;
this.emulateDevice = this.params.emulateDevice || {};
@ -85,7 +89,7 @@ export class Crawler {
this.gotoOpts = {
waitUntil: this.params.waitUntil,
timeout: this.params.timeout
timeout: this.params.pageLoadTimeout * 1000
};
// pages directory
@ -152,7 +156,9 @@ export class Crawler {
logger.debug(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state");
this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.behaviorTimeout + this.params.timeout, os.hostname());
logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state");
this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.maxPageTime, os.hostname());
if (this.params.saveState === "always" && this.params.saveStateInterval) {
logger.debug(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`, {}, "state");
@ -406,11 +412,9 @@ export class Crawler {
} else if (data.skipBehaviors) {
logger.info("Skipping behaviors for slow page", logDetails, "behavior");
} else {
const behaviorTimeout = this.params.behaviorTimeout / 1000;
const res = await timedRun(
this.runBehaviors(page, data.filteredFrames, logDetails),
behaviorTimeout,
this.params.behaviorTimeout,
"Behaviors timed out",
logDetails,
"behavior"
@ -423,6 +427,11 @@ export class Crawler {
}
}
if (this.params.pageExtraDelay) {
logger.info(`Waiting ${this.params.pageExtraDelay} seconds before moving on to next page`, logDetails);
await sleep(this.params.pageExtraDelay);
}
return true;
}
@ -557,8 +566,8 @@ export class Crawler {
}
if (this.params.timeLimit) {
const elapsed = (Date.now() - this.startTime) / 1000;
if (elapsed > this.params.timeLimit) {
const elapsed = secondsElapsed(this.startTime);
if (elapsed >= this.params.timeLimit) {
logger.info(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
interrupt = true;
}
@ -683,9 +692,10 @@ export class Crawler {
}
});
const totalPageTimeout = (this.params.behaviorTimeout + this.params.timeout) / 1000 + 60;
await runWorkers(this, this.params.workers, totalPageTimeout);
// --------------
// Run Crawl Here!
await runWorkers(this, this.params.workers, this.maxPageTime);
// --------------
await this.serializeConfig(true);
@ -1359,7 +1369,7 @@ export class Crawler {
if (!done) {
// if not done, save state only after specified interval has elapsed
if ((now.getTime() - this.lastSaveTime) < this.saveStateInterval) {
if (secondsElapsed(this.lastSaveTime, now) < this.params.saveStateInterval) {
return;
}
}

@ -74,7 +74,8 @@ class ArgParser {
type: "number",
},
"timeout": {
"pageLoadTimeout": {
alias: "timeout",
describe: "Timeout for each page to load (in seconds)",
default: 90,
type: "number",
@ -223,6 +224,13 @@ class ArgParser {
type: "number",
},
"pageExtraDelay": {
alias: "delay",
describe: "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page",
default: 0,
type: "number",
},
"profile": {
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
type: "string",
@ -354,10 +362,7 @@ class ArgParser {
logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
}
argv.timeout *= 1000;
// waitUntil condition must be: load, domcontentloaded, networkidle
// TODO: Playwright migration - for now, can only support one
// waitUntil condition must be one of WAIT_UNTIL_OPTS: load, domcontentloaded, networkidle
// (see: https://playwright.dev/docs/api/class-page#page-goto-option-wait-until)
if (!WAIT_UNTIL_OPTS.includes(argv.waitUntil)) {
logger.fatal("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));
@ -385,9 +390,6 @@ class ArgParser {
argv.behaviors = argv.behaviors.split(",");
}
argv.behaviors.forEach((x) => behaviorOpts[x] = true);
if (argv.behaviorTimeout) {
behaviorOpts.timeout = argv.behaviorTimeout *= 1000;
}
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
argv.behaviorOpts = JSON.stringify(behaviorOpts);

@ -39,7 +39,7 @@ export class PageState
// ============================================================================
export class RedisCrawlState
{
constructor(redis, key, pageTimeout, uid) {
constructor(redis, key, maxPageTime, uid) {
this.redis = redis;
this.maxRetryPending = 1;
@ -48,7 +48,7 @@ export class RedisCrawlState
this.uid = uid;
this.key = key;
this.pageTimeout = pageTimeout / 1000;
this.maxPageTime = maxPageTime;
this.qkey = this.key + ":q";
this.pkey = this.key + ":p";
@ -152,7 +152,7 @@ return 0;
async markStarted(url) {
const started = this._timestamp();
return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.pageTimeout);
return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.maxPageTime);
}
async markFinished(url) {

@ -24,4 +24,8 @@ export function timedRun(promise, seconds, message="Promise timed out", logDetai
});
}
export function secondsElapsed(startTime, nowDate = null) {
nowDate = nowDate || new Date();
return (nowDate.getTime() - startTime) / 1000;
}

@ -6,14 +6,14 @@ const MAX_REUSE = 5;
const NEW_WINDOW_TIMEOUT = 10;
// ===========================================================================
export function runWorkers(crawler, numWorkers, timeout) {
export function runWorkers(crawler, numWorkers, maxPageTime) {
logger.info(`Creating ${numWorkers} workers`, {}, "worker");
const workers = [];
for (let i = 0; i < numWorkers; i++) {
//workers.push(new PageWorker(`worker-${i+1}`, crawler, timeout));
workers.push(new PageWorker(i, crawler, timeout));
//workers.push(new PageWorker(`worker-${i+1}`, crawler, maxPageTime));
workers.push(new PageWorker(i, crawler, maxPageTime));
}
return Promise.allSettled(workers.map((worker) => worker.run()));
@ -23,10 +23,10 @@ export function runWorkers(crawler, numWorkers, timeout) {
// ===========================================================================
export class PageWorker
{
constructor(id, crawler, timeout) {
constructor(id, crawler, maxPageTime) {
this.id = id;
this.crawler = crawler;
this.timeout = timeout;
this.maxPageTime = maxPageTime;
this.reuseCount = 0;
this.page = null;
@ -134,7 +134,7 @@ export class PageWorker
await Promise.race([
timedRun(
this.crawler.crawlPage(opts),
this.timeout,
this.maxPageTime,
"Page Worker Timeout",
{workerid},
"worker"