Exclusion Optimizations: follow-up to (#423)
Follow-up to #408 - optimized exclusion filtering:
- use zscan with default count instead of ordered scan to remove
- use glob match when possible (non-regex as determined by string check)
- move isInScope() check to worker to avoid creating a page and then closing it for every excluded URL
- tests: update saved-state test to be more resilient to delays

args: also support '--text false' for backwards compatibility, fixes webrecorder/browsertrix-cloud#1334

bump to 0.12.1
parent 15661eb9c8
commit dd7b926d87

6 changed files with 61 additions and 20 deletions
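The optimization described in the commit message replaces an ordered ZRANGEBYSCORE sweep of the queue with an incremental ZSCAN, letting Redis pre-filter members with a glob MATCH pattern whenever the exclusion is a plain string rather than a regex. A minimal standalone sketch of that approach with ioredis follows; it is not the crawler's implementation (which stores JSON page data as sorted-set members and calls markExcluded()), and the names filterSortedSet, isPlainString, and the example key and pattern are illustrative only:

import Redis from "ioredis";

// Escape-and-compare check: if escaping regex metacharacters leaves the
// string unchanged, treat it as a plain string usable as a glob pattern.
function isPlainString(s) {
  return s.replace(/\\/g, "").replace(/[\\^$*+?.()|[\]{}]/g, "\\$&") === s;
}

// Remove every member of a sorted set whose value matches patternStr.
async function filterSortedSet(redis, key, patternStr) {
  const regex = new RegExp(patternStr);

  // For plain strings, let Redis narrow the scan server-side with MATCH;
  // regex patterns are tested client-side against every scanned member.
  const opts = isPlainString(patternStr) ? {match: `*${patternStr}*`} : {};
  const stream = redis.zscanStream(key, opts);

  return new Promise((resolve, reject) => {
    stream.on("data", async (results) => {
      stream.pause();
      // ZSCAN yields a flat [member, score, member, score, ...] array,
      // so step by two to visit members only.
      for (let i = 0; i < results.length; i += 2) {
        if (regex.test(results[i])) {
          await redis.zrem(key, results[i]);
        }
      }
      stream.resume();
    });
    stream.on("end", resolve);
    stream.on("error", reject);
  });
}

// Example (assumes a local Redis):
// const redis = new Redis("redis://127.0.0.1:6379/0");
// await filterSortedSet(redis, "crawl:queue", "logout");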
@@ -481,11 +481,6 @@ self.__bx_behaviors.selectMainBehavior();
     data.logDetails = logDetails;
     data.workerid = workerid;

-    if (!this.isInScope(data, logDetails)) {
-      logger.info("Page no longer in scope", data);
-      return true;
-    }
-
     // run custom driver here
     await this.driver({page, data, crawler: this});

@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.12.0",
+  "version": "0.12.1",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
@@ -93,8 +93,7 @@ test("check crawl restarted with saved state", async () => {

   try {
     proc = exec(`docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`, {shell: "/bin/bash"}, wait.callback);
-  }
-  catch (error) {
+  } catch (error) {
     console.log(error);
   }

@@ -102,11 +101,22 @@ test("check crawl restarted with saved state", async () => {

   redis = new Redis("redis://127.0.0.1:36379/0", {lazyConnect: true});

-  await redis.connect({maxRetriesPerRequest: 50});
+  try {
+    await redis.connect({
+      maxRetriesPerRequest: 100,
+      retryStrategy(times) {
+        return times < 100 ? 1000 : null;
+      }
+    });

-  expect(await redis.get("test:d")).toBe(numDone + "");
+    await new Promise((resolve) => setTimeout(resolve, 2000));

-  proc.kill("SIGINT");
+    expect(await redis.get("test:d")).toBe(numDone + "");
+  } catch (e) {
+    console.log(e);
+  } finally {
+    proc.kill("SIGINT");
+  }

   finishProcess = wait.p;
 });
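For context on the connection options in the test change above: in ioredis, retryStrategy(times) returns the delay in milliseconds before the next reconnection attempt, or null to stop retrying, and these options are normally supplied to the Redis constructor. A small illustrative sketch (the values and URL are placeholders, not project settings):

import Redis from "ioredis";

// Illustrative only: retry once per second and give up after 100 attempts.
const redis = new Redis("redis://127.0.0.1:36379/0", {
  lazyConnect: true,
  maxRetriesPerRequest: 100,
  retryStrategy(times) {
    return times < 100 ? 1000 : null;
  }
});

await redis.connect();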
@@ -207,6 +207,9 @@ class ArgParser {
         if (!array.length || (array.length === 1 && array[0] === "true")) {
           return ["to-pages"];
         }
+        if (array.length === 1 && array[0] === "false") {
+          return [];
+        }
         return coerce(array);
       }
     },
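The change above makes '--text false' coerce to an empty list, restoring the old boolean-style usage mentioned in the commit message, while '--text' with no value or '--text true' still coerces to ["to-pages"]. A small demonstration of the mapping (the helper name coerceTextArg is hypothetical; the real option lives in ArgParser):

// Hypothetical mirror of the coercion above, for illustration only.
function coerceTextArg(array) {
  if (!array.length || (array.length === 1 && array[0] === "true")) {
    return ["to-pages"];
  }
  if (array.length === 1 && array[0] === "false") {
    return [];
  }
  return array; // the real code passes through to coerce() for validation
}

console.log(coerceTextArg([]));        // ["to-pages"]  (--text)
console.log(coerceTextArg(["true"]));  // ["to-pages"]  (--text true)
console.log(coerceTextArg(["false"])); // []            (--text false)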
@@ -190,7 +190,7 @@ return 0;
   }

   async markFinished(url) {
-    await this.redis.call("hdel", this.pkey, url);
+    await this.redis.hdel(this.pkey, url);

     return await this.redis.incr(this.dkey);
   }
@@ -201,6 +201,12 @@ return 0;
     return await this.redis.incr(this.dkey);
   }

+  async markExcluded(url) {
+    await this.redis.hdel(this.pkey, url);
+
+    await this.redis.srem(this.skey, url);
+  }
+
   recheckScope(data, seeds) {
     const seed = seeds[data.seedId];

@@ -262,7 +268,7 @@ return 0;
     for (const seed of seeds) {
       seed.addExclusion(regex);
     }
-    // can happen async w/o slowing down scrolling
+    // can happen async w/o slowing down crawling
     // each page is still checked if in scope before crawling, even while
     // queue is being filtered
     this.filterQueue(regex);
@@ -284,25 +290,45 @@ return 0;
     }
   }

-  async filterQueue(regexStr) {
+  isStrMatch(s) {
+    // if matches original string, then consider not a regex
+    return s.replace(/\\/g, "").replace(/[\\^$*+?.()|[\]{}]/g, "\\$&") === s;
+  }
+
+  filterQueue(regexStr) {
     const regex = new RegExp(regexStr);

-    const qsize = await this.redis.zcard(this.qkey);
+    let matcher = undefined;

-    const count = 50;
+    // regexStr just a string, optimize by using glob matching
+    if (this.isStrMatch(regexStr)) {
+      matcher = {"match": `*${regexStr}*`};
+    }

-    for (let i = 0; i < qsize; i += count) {
-      const results = await this.redis.zrangebyscore(this.qkey, 0, "inf", "limit", i, count);
+    const stream = this.redis.zscanStream(this.qkey, matcher);
+
+    stream.on("data", async (results) => {
+      stream.pause();

       for (const result of results) {
         const { url } = JSON.parse(result);
         if (regex.test(url)) {
           const removed = await this.redis.zrem(this.qkey, result);
-          await this.redis.srem(this.skey, url);
+          //if (removed) {
+          await this.markExcluded(url);
+          //}
           logger.debug("Removing excluded URL", {url, regex, removed}, "exclusion");
         }
       }
-    }
+
+      stream.resume();
+    });
+
+    return new Promise(resolve => {
+      stream.on("end", () => {
+        resolve();
+      });
+    });
   }

   async incFailCount() {
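To illustrate the string-vs-regex check added above (sample inputs are illustrative): isStrMatch() escapes regex metacharacters and compares the result to the original, so only patterns containing no metacharacters qualify for the glob fast path. Note also that filterQueue() now returns a Promise that resolves when the ZSCAN stream ends, so callers that need completion can await it, while the caller shown in the earlier hunk invokes it without awaiting, per the "can happen async" comment.

// Same escape-and-compare check as isStrMatch() above, shown standalone.
function isStrMatch(s) {
  return s.replace(/\\/g, "").replace(/[\\^$*+?.()|[\]{}]/g, "\\$&") === s;
}

console.log(isStrMatch("logout"));           // true  -> glob MATCH *logout* is used
console.log(isStrMatch("example\\.com/.*")); // false -> full ZSCAN with client-side regex test
console.log(isStrMatch("/search?q="));       // false -> '?' is a regex metacharacter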
@@ -243,6 +243,13 @@ export class PageWorker

       // see if any work data in the queue
       if (data) {
+        // filter out any out-of-scope pages right away
+        if (!this.crawler.isInScope(data, this.logDetails)) {
+          logger.info("Page no longer in scope", data);
+          await crawlState.markExcluded(data.url);
+          continue;
+        }
+
         // init page (new or reuse)
         const opts = await this.initPage(data.url);
