mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00

- inject built 'behaviors.js' from browsertrix-behaviors, init with options and run - remove bgbehaviors - move textextract to root for now - add requirements.txt for python dependencies - remove obsolete --scroll option, now part of the behaviors system logging: - configure logging options via --logging param, can include 'stats' (default), 'pywb', 'behaviors', and 'behaviors-debug' - inject custom logging function for behaviors to call if either behaviors or behaviors-debug is set - 'behaviors-debug' prints all debug messages from behaviors, while regular 'behaviors' prints main behavior messages (useful for verification) dockerfile: add 'rebuild' arg to facilitate rebuilding image from specific step bump to 0.3.0-beta.0
24 lines
469 B
JavaScript
24 lines
469 B
JavaScript
/* eslint-disable no-undef */
|
|
|
|
module.exports = async ({data, page, crawler}) => {
|
|
const {url} = data;
|
|
|
|
if (!await crawler.isHTML(url)) {
|
|
await crawler.directFetchCapture(url);
|
|
return;
|
|
}
|
|
|
|
const gotoOpts = {
|
|
waitUntil: crawler.params.waitUntil,
|
|
timeout: crawler.params.timeout
|
|
};
|
|
|
|
try {
|
|
await page.goto(url, gotoOpts);
|
|
} catch (e) {
|
|
console.log(`Load timeout for ${url}`, e);
|
|
}
|
|
|
|
await crawler.extractLinks(page, "a[href]");
|
|
};
|
|
|