Ensure links added via behaviors also get processed (#478)

Requires webrecorder/browsertrix-behaviors#69 / browsertrix-behaviors
0.5.3, which will add support for behaviors to add links.

Simplify adding links by queueing each link directly, instead of
batching them in groups of 500. Errors are already logged if queueing a
new URL fails.
This commit is contained in:
Ilya Kreymer 2024-02-28 22:56:32 -08:00 committed by GitHub
parent c348de270f
commit 184f4a2395
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 8 additions and 25 deletions

View file

@ -18,7 +18,7 @@
"dependencies": { "dependencies": {
"@novnc/novnc": "^1.4.0", "@novnc/novnc": "^1.4.0",
"@webrecorder/wabac": "^2.16.12", "@webrecorder/wabac": "^2.16.12",
"browsertrix-behaviors": "^0.5.2", "browsertrix-behaviors": "^0.5.3",
"crc": "^4.3.2", "crc": "^4.3.2",
"get-folder-size": "^4.0.0", "get-folder-size": "^4.0.0",
"husky": "^8.0.3", "husky": "^8.0.3",

View file

@ -1722,17 +1722,8 @@ self.__bx_behaviors.selectMainBehavior();
) { ) {
const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data; const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;
let links: string[] = []; callbacks.addLink = async (url: string) => {
const promiseList = []; await this.queueInScopeUrls(seedId, [url], depth, extraHops, logDetails);
callbacks.addLink = (url: string) => {
links.push(url);
if (links.length == 500) {
promiseList.push(
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails),
);
links = [];
}
}; };
const loadLinks = (options: { const loadLinks = (options: {
@ -1801,14 +1792,6 @@ self.__bx_behaviors.selectMainBehavior();
} catch (e) { } catch (e) {
logger.warn("Link Extraction failed", e, "links"); logger.warn("Link Extraction failed", e, "links");
} }
if (links.length) {
promiseList.push(
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails),
);
}
await Promise.allSettled(promiseList);
} }
async queueInScopeUrls( async queueInScopeUrls(

View file

@ -36,7 +36,7 @@ export type QueueEntry = {
// ============================================================================ // ============================================================================
export type PageCallbacks = { export type PageCallbacks = {
addLink?: (url: string) => void; addLink?: (url: string) => Promise<void>;
}; };
// ============================================================================ // ============================================================================

View file

@ -1425,10 +1425,10 @@ browserslist@^4.21.3:
node-releases "^2.0.6" node-releases "^2.0.6"
update-browserslist-db "^1.0.9" update-browserslist-db "^1.0.9"
browsertrix-behaviors@^0.5.2: browsertrix-behaviors@^0.5.3:
version "0.5.2" version "0.5.3"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.2.tgz#d2fe1d6ff08815ff0dd68a05fe1a3cdc4bbec8ca" resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.3.tgz#f987075790b0fd970814f57195e8525277ddd2a0"
integrity sha512-8nhpnzY8OM1mxQ+mZ+m10dpGgMuhCnKUV5YUlitDpMyEfKlEybUmTz5sroVQH8e//NcJox7W6QYjaU2Y/ygxww== integrity sha512-NiVdV42xvj4DvX/z0Dxqzqsa+5e57/M7hIyK3fl41BxzOJqCgSMu0MpkrWuKpbRVo+89ZnBmzh2z6D18Vmn1LA==
bser@2.1.1: bser@2.1.1:
version "2.1.1" version "2.1.1"