mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
link extraction promise cleanup: (#701)
- catch frame.evaluate() errors directly and log them there, so that no exception can propagate before the promise is wrapped in timedRun()
- also add clearTimeout() to timedRun(), so the timer is cleared once the race settles
- possibly fixes openzim/zimit#376
This commit is contained in:
parent
157ac34d8c
commit
652cf9cfa6
2 changed files with 24 additions and 24 deletions
|
@ -2048,33 +2048,31 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
extract = "href",
|
||||
isAttribute = false,
|
||||
} of selectors) {
|
||||
const promiseResults = await Promise.allSettled(
|
||||
frames.map((frame) =>
|
||||
timedRun(
|
||||
frame.evaluate(loadLinks, {
|
||||
await Promise.allSettled(
|
||||
frames.map((frame) => {
|
||||
const getLinks = frame
|
||||
.evaluate(loadLinks, {
|
||||
selector,
|
||||
extract,
|
||||
isAttribute,
|
||||
addLinkFunc: ADD_LINK_FUNC,
|
||||
}),
|
||||
})
|
||||
.catch((e) =>
|
||||
logger.warn("Link Extraction failed in frame", {
|
||||
frameUrl: frame.url,
|
||||
...logDetails,
|
||||
...formatErr(e),
|
||||
}),
|
||||
);
|
||||
|
||||
return timedRun(
|
||||
getLinks,
|
||||
PAGE_OP_TIMEOUT_SECS,
|
||||
"Link extraction timed out",
|
||||
logDetails,
|
||||
),
|
||||
),
|
||||
);
|
||||
}),
|
||||
);
|
||||
|
||||
for (let i = 0; i < promiseResults.length; i++) {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const { status, reason } = promiseResults[i] as any;
|
||||
if (status === "rejected") {
|
||||
logger.warn("Link Extraction failed in frame", {
|
||||
reason,
|
||||
frameUrl: frames[i].url,
|
||||
...logDetails,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
logger.warn("Link Extraction failed", e, "links");
|
||||
|
|
|
@ -18,14 +18,16 @@ export function timedRun(
|
|||
// return Promise return value or log error if timeout is reached first
|
||||
const timeout = seconds * 1000;
|
||||
|
||||
let tm: NodeJS.Timeout;
|
||||
|
||||
const rejectPromiseOnTimeout = (timeout: number) => {
|
||||
return new Promise((resolve, reject) => {
|
||||
setTimeout(() => reject("timeout reached"), timeout);
|
||||
tm = setTimeout(() => reject("timeout reached"), timeout);
|
||||
});
|
||||
};
|
||||
|
||||
return Promise.race([promise, rejectPromiseOnTimeout(timeout)]).catch(
|
||||
(err) => {
|
||||
return Promise.race([promise, rejectPromiseOnTimeout(timeout)])
|
||||
.catch((err) => {
|
||||
if (err === "timeout reached") {
|
||||
const logFunc = isWarn ? logger.warn : logger.error;
|
||||
logFunc.call(
|
||||
|
@ -38,8 +40,8 @@ export function timedRun(
|
|||
//logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
|
||||
throw err;
|
||||
}
|
||||
},
|
||||
);
|
||||
})
|
||||
.finally(() => clearTimeout(tm));
|
||||
}
|
||||
|
||||
export function secondsElapsed(startTime: number, nowDate: Date | null = null) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue