mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 14:33:17 +00:00)
Exclusion Optimizations: follow-up to (#423)
Follow-up to #408 - optimized exclusion filtering:
- use zscan with default count instead of ordered scan to remove
- use glob match when possible (non-regex, as determined by string check)
- move isInScope() check to worker to avoid creating a page and then closing it for every excluded URL
- tests: update saved-state test to be more resilient to delays

args: also support '--text false' for backwards compatibility, fixes webrecorder/browsertrix-cloud#1334

bump to 0.12.1
parent 15661eb9c8
commit dd7b926d87
6 changed files with 61 additions and 20 deletions
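The core of the queue-filtering optimization is easiest to see in isolation. The sketch below (JavaScript, using ioredis, which the crawler already depends on) illustrates the approach the diff takes: scan the sorted-set queue with ZSCAN via zscanStream, add a glob MATCH when the exclusion pattern is a plain string rather than a real regex, and remove matching members. This is a minimal illustration, not the crawler's actual code; the key name and the JSON member shape are assumptions.

```js
import Redis from "ioredis";

// Treat the pattern as a plain string if escaping regex metacharacters is a no-op.
function isStrMatch(s) {
  return s.replace(/\\/g, "").replace(/[\\^$*+?.()|[\]{}]/g, "\\$&") === s;
}

// Remove queue entries whose URL matches the exclusion pattern.
async function filterQueue(redis, qkey, regexStr) {
  const regex = new RegExp(regexStr);
  // Plain strings can be pre-filtered server-side with a glob MATCH.
  const opts = isStrMatch(regexStr) ? { match: `*${regexStr}*` } : {};
  const stream = redis.zscanStream(qkey, opts);

  stream.on("data", async (results) => {
    stream.pause();
    // ZSCAN yields alternating member/score entries.
    for (let i = 0; i < results.length; i += 2) {
      const member = results[i];
      const { url } = JSON.parse(member);
      if (regex.test(url)) {
        await redis.zrem(qkey, member);
      }
    }
    stream.resume();
  });

  await new Promise((resolve) => stream.on("end", resolve));
}

// Illustrative usage (the key name "test:q" is hypothetical):
// const redis = new Redis("redis://127.0.0.1:6379/0");
// await filterQueue(redis, "test:q", "https://example.com/exclude/");
```

Unlike a ZRANGEBYSCORE paging loop, ZSCAN iterates incrementally without ordering the whole set on each pass, and the MATCH filter lets Redis skip non-matching members before they reach the client.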
@@ -481,11 +481,6 @@ self.__bx_behaviors.selectMainBehavior();
     data.logDetails = logDetails;
     data.workerid = workerid;

-    if (!this.isInScope(data, logDetails)) {
-      logger.info("Page no longer in scope", data);
-      return true;
-    }
-
     // run custom driver here
     await this.driver({page, data, crawler: this});

@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.12.0",
+  "version": "0.12.1",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
@@ -93,8 +93,7 @@ test("check crawl restarted with saved state", async () => {

   try {
     proc = exec(`docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`, {shell: "/bin/bash"}, wait.callback);
-  }
-  catch (error) {
+  } catch (error) {
     console.log(error);
   }

@@ -102,11 +101,22 @@ test("check crawl restarted with saved state", async () => {

   redis = new Redis("redis://127.0.0.1:36379/0", {lazyConnect: true});

-  await redis.connect({maxRetriesPerRequest: 50});
+  try {
+    await redis.connect({
+      maxRetriesPerRequest: 100,
+      retryStrategy(times) {
+        return times < 100 ? 1000 : null;
+      }
+    });

+    await new Promise((resolve) => setTimeout(resolve, 2000));

   expect(await redis.get("test:d")).toBe(numDone + "");
+  } catch (e) {
+    console.log(e);
+  } finally {
   proc.kill("SIGINT");
+  }

   finishProcess = wait.p;
 });
@@ -207,6 +207,9 @@ class ArgParser {
       if (!array.length || (array.length === 1 && array[0] === "true")) {
         return ["to-pages"];
       }
+      if (array.length === 1 && array[0] === "false") {
+        return [];
+      }
       return coerce(array);
     }
   },
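For context on the '--text false' fix: the hunk above makes a plain boolean value work again for a flag that now takes a list. A minimal sketch of that coercion, with a hypothetical function name (coerceTextArg is not the crawler's actual identifier):

```js
// Hypothetical name; mirrors the coercion logic added in the hunk above.
function coerceTextArg(array) {
  // bare `--text` or `--text true`: keep the current default
  if (!array.length || (array.length === 1 && array[0] === "true")) {
    return ["to-pages"];
  }
  // `--text false`: empty list, matching the old boolean-off behavior
  if (array.length === 1 && array[0] === "false") {
    return [];
  }
  // otherwise the values are used as the explicit list of text options
  return array;
}

console.log(coerceTextArg([]));           // ["to-pages"]
console.log(coerceTextArg(["false"]));    // []
console.log(coerceTextArg(["to-pages"])); // ["to-pages"]
```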
@@ -190,7 +190,7 @@ return 0;
   }

   async markFinished(url) {
-    await this.redis.call("hdel", this.pkey, url);
+    await this.redis.hdel(this.pkey, url);

     return await this.redis.incr(this.dkey);
   }
@@ -201,6 +201,12 @@ return 0;
     return await this.redis.incr(this.dkey);
   }

+  async markExcluded(url) {
+    await this.redis.hdel(this.pkey, url);
+
+    await this.redis.srem(this.skey, url);
+  }
+
   recheckScope(data, seeds) {
     const seed = seeds[data.seedId];
@@ -262,7 +268,7 @@ return 0;
     for (const seed of seeds) {
       seed.addExclusion(regex);
     }
-    // can happen async w/o slowing down scrolling
+    // can happen async w/o slowing down crawling
     // each page is still checked if in scope before crawling, even while
     // queue is being filtered
     this.filterQueue(regex);
@@ -284,25 +290,45 @@ return 0;
     }
   }

-  async filterQueue(regexStr) {
+  isStrMatch(s) {
+    // if matches original string, then consider not a regex
+    return s.replace(/\\/g, "").replace(/[\\^$*+?.()|[\]{}]/g, "\\$&") === s;
+  }
+
+  filterQueue(regexStr) {
     const regex = new RegExp(regexStr);

-    const qsize = await this.redis.zcard(this.qkey);
+    let matcher = undefined;

-    const count = 50;
+    // regexStr just a string, optimize by using glob matching
+    if (this.isStrMatch(regexStr)) {
+      matcher = {"match": `*${regexStr}*`};
+    }

-    for (let i = 0; i < qsize; i += count) {
-      const results = await this.redis.zrangebyscore(this.qkey, 0, "inf", "limit", i, count);
+    const stream = this.redis.zscanStream(this.qkey, matcher);

+    stream.on("data", async (results) => {
+      stream.pause();

       for (const result of results) {
         const { url } = JSON.parse(result);
         if (regex.test(url)) {
           const removed = await this.redis.zrem(this.qkey, result);
-          await this.redis.srem(this.skey, url);
+          //if (removed) {
+          await this.markExcluded(url);
+          //}
           logger.debug("Removing excluded URL", {url, regex, removed}, "exclusion");
         }
       }
-    }
+
+      stream.resume();
+    });
+
+    return new Promise(resolve => {
+      stream.on("end", () => {
+        resolve();
+      });
+    });
   }

   async incFailCount() {
@@ -243,6 +243,13 @@ export class PageWorker

       // see if any work data in the queue
       if (data) {
+        // filter out any out-of-scope pages right away
+        if (!this.crawler.isInScope(data, this.logDetails)) {
+          logger.info("Page no longer in scope", data);
+          await crawlState.markExcluded(data.url);
+          continue;
+        }
+
         // init page (new or reuse)
         const opts = await this.initPage(data.url);