Exclusion Optimizations: follow-up to #408 (#423)

Follow-up to #408 - optimized exclusion filtering:
- use zscan with the default count instead of an ordered scan when removing excluded URLs
- use glob matching when possible (when the exclusion is a plain string rather than a regex, as determined by a string check) - see the sketch after this list
- move the isInScope() check into the worker to avoid creating and then closing a page for every excluded URL
- tests: update the saved-state test to be more resilient to delays
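
For illustration, a minimal standalone sketch of the zscan-plus-glob approach described above (assuming ioredis; the queue key name "q" and the JSON member format are stand-ins mirroring the diff below, not the crawler's exact code):

import Redis from "ioredis";

const redis = new Redis();

// Treat the exclusion as a plain string if escaping regex metacharacters changes nothing.
function isPlainString(s) {
  return s.replace(/[\\^$*+?.()|[\]{}]/g, "\\$&") === s;
}

async function filterQueue(regexStr, qkey = "q") {
  const regex = new RegExp(regexStr);

  // Plain strings can be pre-filtered server-side with a ZSCAN MATCH glob;
  // regexes fall back to scanning every member and testing client-side.
  const opts = isPlainString(regexStr) ? { match: `*${regexStr}*` } : undefined;
  const stream = redis.zscanStream(qkey, opts);

  for await (const chunk of stream) {
    // each ZSCAN chunk alternates member, score, member, score, ...
    for (let i = 0; i < chunk.length; i += 2) {
      const member = chunk[i];
      const { url } = JSON.parse(member);
      if (regex.test(url)) {
        await redis.zrem(qkey, member);
      }
    }
  }
}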

args: also support '--text false' for backwards compatibility, fixes
webrecorder/browsertrix-cloud#1334
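
For reference, a hedged sketch of the resulting behavior (a hypothetical standalone helper, not the actual ArgParser code; see the coerce diff below):

// --text               -> ["to-pages"]  (default text extraction)
// --text true          -> ["to-pages"]
// --text false         -> []            (text extraction disabled, matching the older boolean-style flag)
// --text <mode list>   -> parsed as a list of extraction modes
function coerceTextArg(values) {
  if (!values.length || (values.length === 1 && values[0] === "true")) {
    return ["to-pages"];
  }
  if (values.length === 1 && values[0] === "false") {
    return [];
  }
  return values;
}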

bump to 0.12.1
Ilya Kreymer 2023-11-03 15:15:09 -07:00 committed by GitHub
parent 15661eb9c8
commit dd7b926d87
6 changed files with 61 additions and 20 deletions

crawler.js

@@ -481,11 +481,6 @@ self.__bx_behaviors.selectMainBehavior();
     data.logDetails = logDetails;
     data.workerid = workerid;
 
-    if (!this.isInScope(data, logDetails)) {
-      logger.info("Page no longer in scope", data);
-      return true;
-    }
-
     // run custom driver here
     await this.driver({page, data, crawler: this});

package.json

@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.12.0",
+  "version": "0.12.1",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",

tests/saved-state.test.js

@@ -93,8 +93,7 @@ test("check crawl restarted with saved state", async () => {
   try {
     proc = exec(`docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`, {shell: "/bin/bash"}, wait.callback);
-  }
-  catch (error) {
+  } catch (error) {
     console.log(error);
   }
@@ -102,11 +101,22 @@ test("check crawl restarted with saved state", async () => {
   redis = new Redis("redis://127.0.0.1:36379/0", {lazyConnect: true});
-  await redis.connect({maxRetriesPerRequest: 50});
+  try {
+    await redis.connect({
+      maxRetriesPerRequest: 100,
+      retryStrategy(times) {
+        return times < 100 ? 1000 : null;
+      }
+    });
 
-  expect(await redis.get("test:d")).toBe(numDone + "");
+    await new Promise((resolve) => setTimeout(resolve, 2000));
 
-  proc.kill("SIGINT");
+    expect(await redis.get("test:d")).toBe(numDone + "");
+  } catch (e) {
+    console.log(e);
+  } finally {
+    proc.kill("SIGINT");
+  }
 
   finishProcess = wait.p;
 });

util/argParser.js

@@ -207,6 +207,9 @@ class ArgParser {
         if (!array.length || (array.length === 1 && array[0] === "true")) {
           return ["to-pages"];
         }
+        if (array.length === 1 && array[0] === "false") {
+          return [];
+        }
         return coerce(array);
       }
     },

util/state.js

@@ -190,7 +190,7 @@ return 0;
   }
 
   async markFinished(url) {
-    await this.redis.call("hdel", this.pkey, url);
+    await this.redis.hdel(this.pkey, url);
 
     return await this.redis.incr(this.dkey);
   }
@@ -201,6 +201,12 @@ return 0;
     return await this.redis.incr(this.dkey);
   }
 
+  async markExcluded(url) {
+    await this.redis.hdel(this.pkey, url);
+
+    await this.redis.srem(this.skey, url);
+  }
+
   recheckScope(data, seeds) {
     const seed = seeds[data.seedId];
@@ -262,7 +268,7 @@ return 0;
     for (const seed of seeds) {
      seed.addExclusion(regex);
     }
-    // can happen async w/o slowing down scrolling
+    // can happen async w/o slowing down crawling
     // each page is still checked if in scope before crawling, even while
     // queue is being filtered
     this.filterQueue(regex);
@@ -284,25 +290,45 @@ return 0;
     }
   }
 
-  async filterQueue(regexStr) {
+  isStrMatch(s) {
+    // if matches original string, then consider not a regex
+    return s.replace(/\\/g, "").replace(/[\\^$*+?.()|[\]{}]/g, "\\$&") === s;
+  }
+
+  filterQueue(regexStr) {
     const regex = new RegExp(regexStr);
-    const qsize = await this.redis.zcard(this.qkey);
-    const count = 50;
+    let matcher = undefined;
 
-    for (let i = 0; i < qsize; i += count) {
-      const results = await this.redis.zrangebyscore(this.qkey, 0, "inf", "limit", i, count);
+    // regexStr just a string, optimize by using glob matching
+    if (this.isStrMatch(regexStr)) {
+      matcher = {"match": `*${regexStr}*`};
+    }
+
+    const stream = this.redis.zscanStream(this.qkey, matcher);
+
+    stream.on("data", async (results) => {
+      stream.pause();
       for (const result of results) {
        const { url } = JSON.parse(result);
        if (regex.test(url)) {
          const removed = await this.redis.zrem(this.qkey, result);
-          await this.redis.srem(this.skey, url);
+          //if (removed) {
+          await this.markExcluded(url);
+          //}
          logger.debug("Removing excluded URL", {url, regex, removed}, "exclusion");
        }
       }
-    }
+
+      stream.resume();
+    });
+
+    return new Promise(resolve => {
+      stream.on("end", () => {
+        resolve();
+      });
+    });
   }
 
   async incFailCount() {

util/worker.js

@@ -243,6 +243,13 @@ export class PageWorker
       // see if any work data in the queue
       if (data) {
+        // filter out any out-of-scope pages right away
+        if (!this.crawler.isInScope(data, this.logDetails)) {
+          logger.info("Page no longer in scope", data);
+          await crawlState.markExcluded(data.url);
+          continue;
+        }
+
         // init page (new or reuse)
         const opts = await this.initPage(data.url);