mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-07 13:49:47 +00:00
improvements to support pausing: (#919)
- clear size to 0 immediately after the WACZ is uploaded; - if the crawler is paused, ensure any data is uploaded on startup; - fetcher queue: stop queuing async requests if the recorder is marked as stopping
This commit is contained in:
parent
565ba54454
commit
b9b804e660
2 changed files with 23 additions and 5 deletions
|
|
@ -1554,7 +1554,10 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
if (interrupt) {
|
||||
this.uploadAndDeleteLocal = true;
|
||||
this.gracefulFinishOnInterrupt(interrupt);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
gracefulFinishOnInterrupt(interruptReason: InterruptReason) {
|
||||
|
|
@ -1691,7 +1694,11 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
return;
|
||||
}
|
||||
|
||||
await this.checkLimits();
|
||||
if (await this.checkLimits()) {
|
||||
// if interrupted
|
||||
await this.postCrawl();
|
||||
return;
|
||||
}
|
||||
|
||||
await this.crawlState.setStatus("running");
|
||||
|
||||
|
|
@ -1869,6 +1876,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const uploaded = await this.generateWACZ();
|
||||
|
||||
if (uploaded && this.uploadAndDeleteLocal) {
|
||||
await this.crawlState.setArchiveSize(0);
|
||||
logger.info(
|
||||
`Uploaded WACZ, deleting local data to free up space: ${this.collDir}`,
|
||||
);
|
||||
|
|
|
|||
|
|
@ -145,6 +145,8 @@ export class Recorder extends EventEmitter {
|
|||
|
||||
shouldSaveStorage = false;
|
||||
|
||||
stopping = false;
|
||||
|
||||
constructor({
|
||||
workerid,
|
||||
writer,
|
||||
|
|
@ -857,8 +859,10 @@ export class Recorder extends EventEmitter {
|
|||
}
|
||||
|
||||
addAsyncFetch(opts: AsyncFetchOptions) {
|
||||
const fetcher = new AsyncFetcher(opts);
|
||||
void this.fetcherQ.add(() => fetcher.load());
|
||||
if (!this.stopping) {
|
||||
const fetcher = new AsyncFetcher(opts);
|
||||
void this.fetcherQ.add(() => fetcher.load());
|
||||
}
|
||||
}
|
||||
|
||||
addExternalFetch(url: string, cdp: CDPSession) {
|
||||
|
|
@ -1046,6 +1050,8 @@ export class Recorder extends EventEmitter {
|
|||
}
|
||||
|
||||
async onDone(timeout: number) {
|
||||
this.stopping = true;
|
||||
|
||||
await this.crawlState.setStatus("pending-wait");
|
||||
|
||||
const finishFetch = async () => {
|
||||
|
|
@ -1063,6 +1069,8 @@ export class Recorder extends EventEmitter {
|
|||
);
|
||||
}
|
||||
|
||||
this.fetcherQ.clear();
|
||||
|
||||
logger.debug("Finishing WARC writing", this.logDetails, "recorder");
|
||||
|
||||
await this.writer.flush();
|
||||
|
|
@ -1356,8 +1364,10 @@ export class Recorder extends EventEmitter {
|
|||
await fetcher.doCancel();
|
||||
return false;
|
||||
}
|
||||
state.asyncLoading = true;
|
||||
void this.fetcherQ.add(() => fetcher.loadDirectPage(state, crawler));
|
||||
if (!this.stopping) {
|
||||
state.asyncLoading = true;
|
||||
void this.fetcherQ.add(() => fetcher.loadDirectPage(state, crawler));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue