mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Ensure links added via behaviors also get processed (#478)
Requires webrecorder/browsertrix-behaviors#69 / browsertrix-behaviors 0.5.3, which adds support for behaviors to add links. Simplifies link handling by queueing each link directly as it is added, instead of batching links in groups of 500. Errors are already logged if queueing a new URL fails.
This commit is contained in:
parent
c348de270f
commit
184f4a2395
4 changed files with 8 additions and 25 deletions
|
@ -18,7 +18,7 @@
|
|||
"dependencies": {
|
||||
"@novnc/novnc": "^1.4.0",
|
||||
"@webrecorder/wabac": "^2.16.12",
|
||||
"browsertrix-behaviors": "^0.5.2",
|
||||
"browsertrix-behaviors": "^0.5.3",
|
||||
"crc": "^4.3.2",
|
||||
"get-folder-size": "^4.0.0",
|
||||
"husky": "^8.0.3",
|
||||
|
|
|
@ -1722,17 +1722,8 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
) {
|
||||
const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;
|
||||
|
||||
let links: string[] = [];
|
||||
const promiseList = [];
|
||||
|
||||
callbacks.addLink = (url: string) => {
|
||||
links.push(url);
|
||||
if (links.length == 500) {
|
||||
promiseList.push(
|
||||
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails),
|
||||
);
|
||||
links = [];
|
||||
}
|
||||
callbacks.addLink = async (url: string) => {
|
||||
await this.queueInScopeUrls(seedId, [url], depth, extraHops, logDetails);
|
||||
};
|
||||
|
||||
const loadLinks = (options: {
|
||||
|
@ -1801,14 +1792,6 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
} catch (e) {
|
||||
logger.warn("Link Extraction failed", e, "links");
|
||||
}
|
||||
|
||||
if (links.length) {
|
||||
promiseList.push(
|
||||
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails),
|
||||
);
|
||||
}
|
||||
|
||||
await Promise.allSettled(promiseList);
|
||||
}
|
||||
|
||||
async queueInScopeUrls(
|
||||
|
|
|
@ -36,7 +36,7 @@ export type QueueEntry = {
|
|||
|
||||
// ============================================================================
|
||||
export type PageCallbacks = {
|
||||
addLink?: (url: string) => void;
|
||||
addLink?: (url: string) => Promise<void>;
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
|
|
|
@ -1425,10 +1425,10 @@ browserslist@^4.21.3:
|
|||
node-releases "^2.0.6"
|
||||
update-browserslist-db "^1.0.9"
|
||||
|
||||
browsertrix-behaviors@^0.5.2:
|
||||
version "0.5.2"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.2.tgz#d2fe1d6ff08815ff0dd68a05fe1a3cdc4bbec8ca"
|
||||
integrity sha512-8nhpnzY8OM1mxQ+mZ+m10dpGgMuhCnKUV5YUlitDpMyEfKlEybUmTz5sroVQH8e//NcJox7W6QYjaU2Y/ygxww==
|
||||
browsertrix-behaviors@^0.5.3:
|
||||
version "0.5.3"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.3.tgz#f987075790b0fd970814f57195e8525277ddd2a0"
|
||||
integrity sha512-NiVdV42xvj4DvX/z0Dxqzqsa+5e57/M7hIyK3fl41BxzOJqCgSMu0MpkrWuKpbRVo+89ZnBmzh2z6D18Vmn1LA==
|
||||
|
||||
bser@2.1.1:
|
||||
version "2.1.1"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue