Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 06:23:16 +00:00
seed url issues: log duplicate seeds or limit reached when adding seeds only once
This commit is contained in:
parent 8d87293932
commit 13f940aa49
1 changed file with 11 additions and 0 deletions
@@ -129,6 +129,8 @@ export class Crawler {
   limitHit = false;
   pageLimit: number;
 
+  dupeSeedsFound = false;
+
   saveStateFiles: string[] = [];
   lastSaveTime: number;
 
@@ -2466,10 +2468,19 @@ self.__bx_behaviors.selectMainBehavior();
           { url, ...logDetails },
           "links",
         );
+        if (!this.limitHit && depth === 0) {
+          logger.error(
+            "Page limit reached when adding URL list, some URLs not crawled.",
+          );
+        }
         this.limitHit = true;
         return false;
 
       case QueueState.DUPE_URL:
+        if (!this.dupeSeedsFound && depth === 0) {
+          logger.error("Duplicate seed URLs found and skipped");
+          this.dupeSeedsFound = true;
+        }
         logger.debug(
           "Page URL not queued, already seen",
           { url, ...logDetails },
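For context, a minimal standalone sketch of the once-only logging pattern this diff introduces: boolean flags (limitHit, dupeSeedsFound) guard the error messages so a reached page limit or duplicate seeds are reported a single time rather than once per seed. The SeedQueue class, its pageLimit check, and the console-based logger below are simplified assumptions for illustration only, not browsertrix-crawler's actual queue implementation.

// Illustrative sketch of the "log once" guard used in the commit above.
enum QueueState {
  ADDED,
  LIMIT_HIT,
  DUPE_URL,
}

class SeedQueue {
  private seen = new Set<string>();
  private limitHit = false;
  private dupeSeedsFound = false;

  constructor(private pageLimit: number) {}

  // Attempt to queue a URL; depth === 0 marks a seed URL.
  add(url: string, depth: number): QueueState {
    if (this.seen.size >= this.pageLimit) {
      // Report the page limit only once for the initial seed list.
      if (!this.limitHit && depth === 0) {
        console.error(
          "Page limit reached when adding URL list, some URLs not crawled.",
        );
      }
      this.limitHit = true;
      return QueueState.LIMIT_HIT;
    }
    if (this.seen.has(url)) {
      // Report duplicate seeds once, instead of once per duplicate.
      if (!this.dupeSeedsFound && depth === 0) {
        console.error("Duplicate seed URLs found and skipped");
        this.dupeSeedsFound = true;
      }
      return QueueState.DUPE_URL;
    }
    this.seen.add(url);
    return QueueState.ADDED;
  }
}

// Usage: three identical seeds produce a single "Duplicate seed URLs" message.
const queue = new SeedQueue(10);
for (const url of ["https://a.example/", "https://a.example/", "https://a.example/"]) {
  queue.add(url, 0);
}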