mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
25 lines
469 B
JavaScript
25 lines
469 B
JavaScript
![]() |
/* eslint-disable no-undef */
|
||
|
|
||
|
module.exports = async ({data, page, crawler}) => {
|
||
|
const {url} = data;
|
||
|
|
||
|
if (!await crawler.isHTML(url)) {
|
||
|
await crawler.directFetchCapture(url);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
const gotoOpts = {
|
||
|
waitUntil: crawler.params.waitUntil,
|
||
|
timeout: crawler.params.timeout
|
||
|
};
|
||
|
|
||
|
try {
|
||
|
await page.goto(url, gotoOpts);
|
||
|
} catch (e) {
|
||
|
console.log(`Load timeout for ${url}`, e);
|
||
|
}
|
||
|
|
||
|
await crawler.extractLinks(page, "a[href]");
|
||
|
};
|
||
|
|