mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-08 06:09:48 +00:00
cleanup, keep compatibility with redis 6 still
set to 'post-crawl' state after uploading
This commit is contained in:
parent
61acafd234
commit
105bd06f30
4 changed files with 37 additions and 28 deletions
|
|
@ -25,7 +25,7 @@
|
||||||
"fetch-socks": "^1.3.0",
|
"fetch-socks": "^1.3.0",
|
||||||
"get-folder-size": "^4.0.0",
|
"get-folder-size": "^4.0.0",
|
||||||
"husky": "^8.0.3",
|
"husky": "^8.0.3",
|
||||||
"ioredis": "^5.8.2",
|
"ioredis": "^5.3.2",
|
||||||
"iso-639-1": "^3.1.5",
|
"iso-639-1": "^3.1.5",
|
||||||
"js-levenshtein": "^1.1.6",
|
"js-levenshtein": "^1.1.6",
|
||||||
"js-yaml": "^4.1.0",
|
"js-yaml": "^4.1.0",
|
||||||
|
|
|
||||||
|
|
@ -203,6 +203,7 @@ export class Crawler {
|
||||||
| null = null;
|
| null = null;
|
||||||
|
|
||||||
recording: boolean;
|
recording: boolean;
|
||||||
|
deduping = false;
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
const args = this.parseArgs();
|
const args = this.parseArgs();
|
||||||
|
|
@ -344,6 +345,8 @@ export class Crawler {
|
||||||
const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0";
|
const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0";
|
||||||
const dedupRedisUrl = this.params.redisDedupUrl || redisUrl;
|
const dedupRedisUrl = this.params.redisDedupUrl || redisUrl;
|
||||||
|
|
||||||
|
this.deduping = dedupRedisUrl !== redisUrl;
|
||||||
|
|
||||||
if (!redisUrl.startsWith("redis://")) {
|
if (!redisUrl.startsWith("redis://")) {
|
||||||
logger.fatal(
|
logger.fatal(
|
||||||
"stateStoreUrl must start with redis:// -- Only redis-based store currently supported",
|
"stateStoreUrl must start with redis:// -- Only redis-based store currently supported",
|
||||||
|
|
@ -1910,10 +1913,20 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.params.generateWACZ && generateFiles) {
|
if (this.params.generateWACZ && generateFiles) {
|
||||||
const uploaded = await this.generateWACZ();
|
const wacz = await this.generateWACZ();
|
||||||
|
|
||||||
if (uploaded && this.uploadAndDeleteLocal) {
|
if (wacz) {
|
||||||
|
if (this.deduping) {
|
||||||
|
await this.crawlState.setStatus("post-crawl");
|
||||||
|
await this.crawlState.updateDedupSource(wacz);
|
||||||
|
}
|
||||||
|
|
||||||
|
await this.crawlState.clearWACZFilename();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wacz && this.uploadAndDeleteLocal) {
|
||||||
await this.crawlState.setArchiveSize(0);
|
await this.crawlState.setArchiveSize(0);
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
`Uploaded WACZ, deleting local data to free up space: ${this.collDir}`,
|
`Uploaded WACZ, deleting local data to free up space: ${this.collDir}`,
|
||||||
);
|
);
|
||||||
|
|
@ -1962,7 +1975,7 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
await streamFinish(logFH);
|
await streamFinish(logFH);
|
||||||
}
|
}
|
||||||
|
|
||||||
async generateWACZ() {
|
async generateWACZ(): Promise<WACZ | null> {
|
||||||
logger.info("Generating WACZ");
|
logger.info("Generating WACZ");
|
||||||
await this.crawlState.setStatus("generate-wacz");
|
await this.crawlState.setStatus("generate-wacz");
|
||||||
|
|
||||||
|
|
@ -1976,11 +1989,11 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
if (!warcFileList.length) {
|
if (!warcFileList.length) {
|
||||||
// if finished, just return
|
// if finished, just return
|
||||||
if (isFinished || (await this.crawlState.isCrawlCanceled())) {
|
if (isFinished || (await this.crawlState.isCrawlCanceled())) {
|
||||||
return;
|
return null;
|
||||||
}
|
}
|
||||||
// possibly restarted after committing, so assume done here!
|
// possibly restarted after committing, so assume done here!
|
||||||
if ((await this.crawlState.numDone()) > 0) {
|
if ((await this.crawlState.numDone()) > 0) {
|
||||||
return;
|
return null;
|
||||||
}
|
}
|
||||||
// fail crawl otherwise
|
// fail crawl otherwise
|
||||||
logger.fatal("No WARC Files, assuming crawl failed");
|
logger.fatal("No WARC Files, assuming crawl failed");
|
||||||
|
|
@ -2041,16 +2054,8 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
|
|
||||||
await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished);
|
await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished);
|
||||||
|
|
||||||
await this.crawlState.updateDedupSource(wacz);
|
return wacz;
|
||||||
|
|
||||||
await this.crawlState.clearWACZFilename();
|
|
||||||
|
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
await this.crawlState.updateDedupSource(wacz);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.error("Error creating WACZ", e);
|
logger.error("Error creating WACZ", e);
|
||||||
if (!streaming) {
|
if (!streaming) {
|
||||||
|
|
@ -2059,6 +2064,8 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
await this.setStatusAndExit(ExitCodes.UploadFailed, "interrupted");
|
await this.setStatusAndExit(ExitCodes.UploadFailed, "interrupted");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
logMemory() {
|
logMemory() {
|
||||||
|
|
|
||||||
|
|
@ -262,13 +262,14 @@ export class RedisDedupIndex {
|
||||||
async commitDedupDone() {
|
async commitDedupDone() {
|
||||||
for await (const hashes of this.dedupRedis.hscanStream(
|
for await (const hashes of this.dedupRedis.hscanStream(
|
||||||
`h:${this.crawlId}`,
|
`h:${this.crawlId}`,
|
||||||
{
|
|
||||||
noValues: true,
|
|
||||||
},
|
|
||||||
)) {
|
)) {
|
||||||
|
let value = false;
|
||||||
for (const hash of hashes) {
|
for (const hash of hashes) {
|
||||||
|
if (!value) {
|
||||||
await this.dedupRedis.hset(DUPE_ALL_HASH_KEY, hash, this.crawlId);
|
await this.dedupRedis.hset(DUPE_ALL_HASH_KEY, hash, this.crawlId);
|
||||||
}
|
}
|
||||||
|
value = !value;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// add to crawls list
|
// add to crawls list
|
||||||
|
|
@ -1338,6 +1339,7 @@ return inx;
|
||||||
await this.redis.set(`${this.key}:profileUploaded`, JSON.stringify(result));
|
await this.redis.set(`${this.key}:profileUploaded`, JSON.stringify(result));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// DEPENDENT CRAWLS FOR DEDUPE
|
||||||
async addDupeCrawlRef(crawlId: string, index: string) {
|
async addDupeCrawlRef(crawlId: string, index: string) {
|
||||||
await this.redis.sadd(`${this.crawlId}:dindex`, crawlId + " " + index);
|
await this.redis.sadd(`${this.crawlId}:dindex`, crawlId + " " + index);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
18
yarn.lock
18
yarn.lock
|
|
@ -370,10 +370,10 @@
|
||||||
resolved "https://registry.yarnpkg.com/@humanwhocodes/object-schema/-/object-schema-2.0.3.tgz#4a2868d75d6d6963e423bcf90b7fd1be343409d3"
|
resolved "https://registry.yarnpkg.com/@humanwhocodes/object-schema/-/object-schema-2.0.3.tgz#4a2868d75d6d6963e423bcf90b7fd1be343409d3"
|
||||||
integrity sha512-93zYdMES/c1D69yZiKDBj0V24vqNzB/koF26KPaagAfd3P/4gUlh3Dys5ogAK+Exi9QyzlD8x/08Zt7wIKcDcA==
|
integrity sha512-93zYdMES/c1D69yZiKDBj0V24vqNzB/koF26KPaagAfd3P/4gUlh3Dys5ogAK+Exi9QyzlD8x/08Zt7wIKcDcA==
|
||||||
|
|
||||||
"@ioredis/commands@1.4.0":
|
"@ioredis/commands@^1.1.1":
|
||||||
version "1.4.0"
|
version "1.2.0"
|
||||||
resolved "https://registry.yarnpkg.com/@ioredis/commands/-/commands-1.4.0.tgz#9f657d51cdd5d2fdb8889592aa4a355546151f25"
|
resolved "https://registry.yarnpkg.com/@ioredis/commands/-/commands-1.2.0.tgz#6d61b3097470af1fdbbe622795b8921d42018e11"
|
||||||
integrity sha512-aFT2yemJJo+TZCmieA7qnYGQooOS7QfNmYrzGtsYd3g9j5iDP8AimYYAesf79ohjbLG12XxC4nG5DyEnC88AsQ==
|
integrity sha512-Sx1pU8EM64o2BrqNpEO1CNLtKQwyhuXuqyfH7oGKCk+1a33d2r5saW8zNwm3j6BTExtjrv2BxTgzzkMwts6vGg==
|
||||||
|
|
||||||
"@istanbuljs/load-nyc-config@^1.0.0":
|
"@istanbuljs/load-nyc-config@^1.0.0":
|
||||||
version "1.1.0"
|
version "1.1.0"
|
||||||
|
|
@ -3014,12 +3014,12 @@ intl-messageformat@^10.5.3:
|
||||||
"@formatjs/icu-messageformat-parser" "2.11.2"
|
"@formatjs/icu-messageformat-parser" "2.11.2"
|
||||||
tslib "^2.8.0"
|
tslib "^2.8.0"
|
||||||
|
|
||||||
ioredis@^5.8.2:
|
ioredis@^5.3.2:
|
||||||
version "5.8.2"
|
version "5.4.1"
|
||||||
resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-5.8.2.tgz#c7a228a26cf36f17a5a8011148836877780e2e14"
|
resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-5.4.1.tgz#1c56b70b759f01465913887375ed809134296f40"
|
||||||
integrity sha512-C6uC+kleiIMmjViJINWk80sOQw5lEzse1ZmvD+S/s8p8CWapftSaC+kocGTx6xrbrJ4WmYQGC08ffHLr6ToR6Q==
|
integrity sha512-2YZsvl7jopIa1gaePkeMtd9rAcSjOOjPtpcLlOeusyO+XH2SK5ZcT+UCrElPP+WVIInh2TzeI4XW9ENaSLVVHA==
|
||||||
dependencies:
|
dependencies:
|
||||||
"@ioredis/commands" "1.4.0"
|
"@ioredis/commands" "^1.1.1"
|
||||||
cluster-key-slot "^1.1.0"
|
cluster-key-slot "^1.1.0"
|
||||||
debug "^4.3.4"
|
debug "^4.3.4"
|
||||||
denque "^2.1.0"
|
denque "^2.1.0"
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue