Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 14:33:17 +00:00)
misc fixes:

- allow specifying custom redis start args via the REDIS_ARGS env var, parsed with splitArgsQuoteSafe() (see the sketch below)
- unify checking whether the crawl should be stopped, and also check when trying to get a new page
- if getting a new page fails, just return, avoiding a null dereference
- support adding an offset to the '-X' ordinal at the end via the CRAWL_INDEX_OFFSET env var
parent 212bff0a27
commit 1a1b9b4bff

3 changed files with 37 additions and 13 deletions
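The REDIS_ARGS change lands in one of the changed files whose hunks are not shown below. As a rough illustration only, a quote-safe splitter could look like the following sketch; the name splitArgsQuoteSafe() comes from the commit message, but this body is an assumption, not the commit's actual implementation:

    // Hypothetical sketch, not the actual splitArgsQuoteSafe() from this commit:
    // split on whitespace, but keep double- or single-quoted segments together.
    function splitArgsQuoteSafe(str) {
      const parts = str.match(/"[^"]*"|'[^']*'|\S+/g) || [];
      // strip the surrounding quotes from quoted segments
      return parts.map((p) => p.replace(/^["']|["']$/g, ""));
    }

    // e.g. REDIS_ARGS='--maxmemory 200mb --save ""'
    // -> ["--maxmemory", "200mb", "--save", ""]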
@@ -23,7 +23,8 @@ export function runWorkers(crawler, numWorkers, maxPageTime) {
     const rx = new RegExp(rxEscape(process.env.CRAWL_ID) + "\\-([\\d]+)$");
     const m = os.hostname().match(rx);
     if (m) {
-      offset = m[1] * numWorkers;
+      offset = Number(m[1]) + (Number(process.env.CRAWL_INDEX_OFFSET) || 0);
+      offset = offset * numWorkers;
       logger.info("Starting workerid index at " + offset, "worker");
     }
   }
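For example (values assumed for illustration, not taken from the commit), with hostname "mycrawl-2", CRAWL_INDEX_OFFSET=10 and numWorkers=4, the new code computes the starting worker id as:

    // assumed example values, not from the diff
    const numWorkers = 4;
    const m = "mycrawl-2".match(/\-([\d]+)$/);        // ordinal parsed from hostname -> "2"
    let offset = Number(m[1]) + (Number("10") || 0);  // 2 + CRAWL_INDEX_OFFSET (10) = 12
    offset = offset * numWorkers;                     // 12 * 4 = 48
    console.log(offset);                              // worker ids for this instance start at 48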
@@ -94,10 +95,10 @@ export class PageWorker
     this.reuseCount = 1;
     const workerid = this.id;

-    while (true) {
+    while (await this.crawler.isCrawlRunning()) {
       try {
         logger.debug("Getting page in new window", {workerid}, "worker");
-        const { page, cdp } = await timedRun(
+        const result = await timedRun(
           this.crawler.browser.newWindowPageWithCDP(),
           NEW_WINDOW_TIMEOUT,
           "New Window Timed Out",
@@ -105,6 +106,12 @@ export class PageWorker
           "worker"
         );

+        if (!result) {
+          continue;
+        }
+
+        const { page, cdp } = result;
+
         this.page = page;
         this.cdp = cdp;
         this.opts = {page: this.page, cdp: this.cdp, workerid};
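The guard added above implies that timedRun() resolves to a falsy value when opening the new window times out; the previous code destructured { page, cdp } from that result directly, which is the null dereference the commit message avoids.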
@@ -180,13 +187,16 @@ export class PageWorker
   async runLoop() {
     const crawlState = this.crawler.crawlState;

-    while (!this.crawler.interrupted && !await crawlState.isCrawlStopped()) {
+    while (await this.crawler.isCrawlRunning()) {
       const data = await crawlState.nextFromQueue();

       // see if any work data in the queue
       if (data) {
         // init page (new or reuse)
         const opts = await this.initPage();
+        if (!opts) {
+          break;
+        }

         // run timed crawl of page
         await this.timedCrawlPage({...opts, data});
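Both loops now call crawler.isCrawlRunning(), whose definition is not part of the hunks shown here. Judging from the condition it replaces in runLoop(), a minimal sketch of the unified check on the crawler might look like this (an assumption, not the actual method body):

    // Sketch only: one place to decide whether the crawl should keep going,
    // combining the interrupted flag with the shared crawl-state check.
    async isCrawlRunning() {
      if (this.interrupted) {
        return false;
      }
      if (await this.crawlState.isCrawlStopped()) {
        return false;
      }
      return true;
    }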