mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-12-08 06:09:48 +00:00
add removing option to also remove unused crawls if doing a full sync, disabled by default
commit 7c37672ae9
parent 0d414f72f1

2 changed files with 76 additions and 10 deletions
@@ -9,6 +9,7 @@ import { initRedisWaitForSuccess } from "./util/redis.js";
 import { AsyncIterReader } from "warcio";
 import { RedisDedupeIndex } from "./util/state.js";
 import { basename } from "node:path";
+import { sleep } from "./util/timing.js";

 export type DedupeIndexEntry = {
   name: string;
@@ -42,6 +43,13 @@ export class CrawlIndexer {
           type: "string",
           required: false,
         },
+
+        removing: {
+          describe: "If set, also remove unused crawls/hashes from index",
+          type: "boolean",
+          required: false,
+          default: false,
+        },
       })
       .parseSync();
   }
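Note: the removing option is a plain yargs boolean that defaults to false, so a full sync never deletes anything unless the operator opts in. A minimal sketch of how the flag surfaces after parseSync(), assuming the same option shape as the hunk above (the hideBin bootstrap below is illustrative, not the crawler's actual setup):

import yargs from "yargs";
import { hideBin } from "yargs/helpers";

// Parse a single boolean flag declared the same way as in the indexer's option table.
const params = yargs(hideBin(process.argv))
  .options({
    removing: {
      describe: "If set, also remove unused crawls/hashes from index",
      type: "boolean",
      required: false,
      default: false,
    },
  })
  .parseSync();

console.log(params.removing); // false unless --removing is passed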
@@ -62,16 +70,24 @@ export class CrawlIndexer {

     for await (const entry of this.iterWACZ({
       url: params.sourceUrl,
-      name: params.sourceCrawlId || params.sourceUrl,
+      name: basename(params.sourceUrl),
+      crawlId: params.sourceCrawlId,
     })) {
       await dedupeIndex.queueImportSource(entry.name, JSON.stringify(entry));
+      if (params.removing && entry.crawlId) {
+        await dedupeIndex.markNotRemoved(entry.crawlId);
+      }
     }

     let count = 0;
+    let total = 0;
     let res;

     while ((res = await dedupeIndex.nextQueuedImportSource())) {
-      const { name, entry, total } = res;
+      const { name, entry, remaining } = res;
+      if (!total) {
+        total = remaining;
+      }
       const { url, crawlId, size, hash } = JSON.parse(
         entry,
       ) as DedupeIndexEntry;
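Note: nextQueuedImportSource() now reports remaining (the queue length at dequeue time) instead of a per-call total, and the caller pins the first remaining value as the overall total for progress reporting. A tiny sketch of that bookkeeping, detached from Redis (onDequeue is a hypothetical helper, not code from this commit):

// The first dequeue fixes the overall total; later dequeues only say what is left.
let total = 0;
let count = 0;

function onDequeue(remaining: number): string {
  if (!total) {
    total = remaining; // first item seen: everything queued is still remaining
  }
  count += 1;
  return `${count} of ${total}`;
}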
@@ -107,7 +123,15 @@ export class CrawlIndexer {
       await dedupeIndex.markImportSourceDone(name, crawlIdReal);
     }

+    if (params.removing) {
+      const removeset = await dedupeIndex.getRemoveSet();
+      if (removeset.size > 0) {
+        await dedupeIndex.removeCrawlIds(removeset);
+      }
+    }
+
     logger.info("Done!");
+    await sleep(30);
     await dedupeIndex.markImportFinishedTS();
     process.exit(ExitCodes.Success);
   }
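Note: taken together, the hunks above add removal as two passes: while queueing a full sync, every crawl id that is still referenced is marked with markNotRemoved(); after the import finishes, getRemoveSet() returns the crawls that were never marked and removeCrawlIds() purges them. A condensed sketch of that flow (the syncWithRemoval wrapper and its structural parameter types are illustrative; the index methods are the ones added in this commit):

// Two-pass removal: mark referenced crawls while queueing, purge the rest afterwards.
async function syncWithRemoval(
  index: {
    queueImportSource(name: string, data: string): Promise<unknown>;
    markNotRemoved(crawlId: string): Promise<unknown>;
    getRemoveSet(): Promise<Set<string>>;
    removeCrawlIds(ids: Set<string>): Promise<unknown>;
  },
  entries: { name: string; crawlId?: string }[],
  removing: boolean,
) {
  for (const entry of entries) {
    await index.queueImportSource(entry.name, JSON.stringify(entry));
    if (removing && entry.crawlId) {
      await index.markNotRemoved(entry.crawlId); // still referenced, keep it
    }
  }

  // ... drain the import queue here, as the indexer's main loop does ...

  if (removing) {
    const removeSet = await index.getRemoveSet(); // all known crawls minus the kept ones
    if (removeSet.size > 0) {
      await index.removeCrawlIds(removeSet);
    }
  }
}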
@@ -180,7 +204,6 @@ export class CrawlIndexer {
   }

   async *iterWACZ(entry: DedupeIndexEntry): AsyncIterable<DedupeIndexEntry> {
-    const { name } = entry;
     let { url } = entry;
     let path = url;

@@ -191,8 +214,7 @@
     }

     if (path.endsWith(".wacz")) {
-      console.log({ ...entry, name: basename(name || url) });
-      yield { ...entry, name: basename(name || url) };
+      yield entry;
     } else if (path.endsWith(".json")) {
       if (!url.startsWith("http://") && !url.startsWith("https://")) {
         const blob = await openAsBlob(url);
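Note: with the caller now passing name: basename(params.sourceUrl) plus crawlId separately, iterWACZ() no longer has to normalize the name itself (and drops a leftover console.log), so a .wacz entry is yielded unchanged; the remaining hunks are in RedisDedupeIndex from ./util/state.js. For reference, node:path basename keeps only the last slash-separated segment, which is why it also works on a WACZ URL path (the example paths below are made up):

import { basename } from "node:path";

// A URL path and a local path both reduce to the .wacz filename used as the source name.
console.log(basename("https://example.com/archives/crawl-1.wacz")); // "crawl-1.wacz"
console.log(basename("/data/archives/crawl-1.wacz")); // "crawl-1.wacz"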
@@ -263,12 +263,12 @@ export class RedisDedupeIndex {
     for await (const hashes of this.dedupeRedis.hscanStream(
       `h:${this.crawlId}`,
     )) {
-      let value = false;
+      let isValue = false;
       for (const hash of hashes) {
-        if (!value) {
+        if (!isValue) {
           await this.dedupeRedis.hsetnx(DUPE_ALL_HASH_KEY, hash, this.crawlId);
         }
-        value = !value;
+        isValue = !isValue;
       }
     }

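Note: renaming value to isValue makes the intent of this loop clearer: ioredis hscanStream() emits each batch as a flat [field, value, field, value, ...] array, so the boolean only tracks whether the current item is a field or its value. The same pattern in isolation, assuming an ioredis client like dedupeRedis (listFields is an illustrative helper):

import Redis from "ioredis";

// Walk a Redis hash with HSCAN and collect only the field names; batches arrive
// interleaved as [field, value, field, value, ...], hence the isValue toggle.
async function listFields(redis: Redis, key: string): Promise<string[]> {
  const fields: string[] = [];
  for await (const batch of redis.hscanStream(key)) {
    let isValue = false;
    for (const item of batch as string[]) {
      if (!isValue) {
        fields.push(item); // even positions are hash fields
      }
      isValue = !isValue; // odd positions are the corresponding values
    }
  }
  return fields;
}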
@@ -371,14 +371,58 @@ export class RedisDedupeIndex {

     await this.dedupeRedis.lrem(this.pendingQ, 1, res);
     const { name } = JSON.parse(res);
-    const total = (await this.dedupeRedis.llen(this.sourceQ)) + 1;
+    const remaining = (await this.dedupeRedis.llen(this.sourceQ)) + 1;
     await this.dedupeRedis.setex(this.pendingPrefix + name, "1", 300);
-    return { name, entry: res, total };
+    return { name, entry: res, remaining };
   }

   async markImportFinishedTS() {
     await this.dedupeRedis.set("last_update_ts", new Date().toISOString());
   }

+  // REMOVE ON IMPORT
+
+  async markNotRemoved(crawlId: string) {
+    await this.dedupeRedis.sadd("noremove", crawlId);
+  }
+
+  async getRemoveSet() {
+    const removeSet = await this.dedupeRedis.sdiff(DUPE_ALL_CRAWLS, "noremove");
+    await this.dedupeRedis.del("noremove");
+    return new Set<string>(removeSet);
+  }
+
+  async removeCrawlIds(toRemove: Set<string>) {
+    for await (const hashes of this.dedupeRedis.hscanStream(
+      DUPE_ALL_HASH_KEY,
+    )) {
+      let isValue = false;
+      let key = "";
+      for (const hash of hashes) {
+        if (!isValue) {
+          key = hash;
+        }
+        if (key && isValue && toRemove.has(hash)) {
+          await this.dedupeRedis.hdel(DUPE_ALL_HASH_KEY, key);
+        }
+        isValue = !isValue;
+      }
+    }
+
+    for (const crawlId of toRemove) {
+      const allWACZ = await this.dedupeRedis.lrange(`c:${crawlId}:wacz`, 0, -1);
+      for (const waczdata of allWACZ) {
+        try {
+          const { filename } = JSON.parse(waczdata);
+          await this.dedupeRedis.srem(this.sourceDone, filename);
+        } catch (e) {
+          // ignore
+        }
+      }
+      await this.dedupeRedis.del(`h:${crawlId}`, `c:${crawlId}:wacz`);
+      await this.dedupeRedis.srem(DUPE_ALL_CRAWLS, crawlId);
+    }
+  }
 }

 // ============================================================================
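Note: the removal bookkeeping itself is a small amount of Redis set arithmetic: crawls seen during this sync are added to a temporary noremove set, the remove set is SDIFF(all known crawls, noremove), and the marker set is deleted afterwards so it only ever reflects the current sync. A standalone sketch of that idea, assuming an ioredis client (key names follow the diff above; computeRemoveSet is illustrative):

import Redis from "ioredis";

// Everything in the all-crawls set that was not marked "noremove" during this sync
// is considered unused and can be dropped from the dedupe index.
async function computeRemoveSet(
  redis: Redis,
  allCrawlsKey: string, // e.g. the key behind DUPE_ALL_CRAWLS
): Promise<Set<string>> {
  const stale = await redis.sdiff(allCrawlsKey, "noremove");
  await redis.del("noremove"); // the marker set is only meaningful for this sync
  return new Set(stale);
}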