Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00
Async Fetch Refactor (#880)

- separate out reading a stream response while the browser is waiting (not really async) from actual async loading; the former is now handled via fetchResponseBody()
- unify async fetch into first trying browser networking for regular GET requests, with a fallback to regular fetch()
- load headers and body separately in async fetch, allowing the request to be cancelled after the headers
- refactor direct fetch of non-HTML pages: load headers, then handle loading the body with the page added asynchronously, allowing the worker to continue loading browser-based pages (should allow more parallelization in the future)
- unify WARC writing in preparation for dedup: a unified serializeWARC() is called for all paths, the WARC digest is computed, and additional payload checks are added for streaming loading
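The header/body split described above can be sketched roughly as follows. This is an illustrative sketch only, not code from this commit: it assumes a global fetch() and AbortController (Node 18+), and the helper name and the HTML-skipping policy are invented for the example.

```ts
// Illustrative sketch (not the crawler's code): resolve headers first,
// then decide whether to stream the body or cancel the request.
async function fetchHeadersThenBody(
  url: string,
  headers: Record<string, string>,
) {
  const abort = new AbortController();

  // fetch() resolves once response headers arrive; the body is not consumed yet
  const resp = await fetch(url, { headers, signal: abort.signal });
  const mime = resp.headers.get("content-type") || "";

  // example policy: hand HTML back to the browser instead of streaming it here
  if (mime.split(";")[0].trim() === "text/html") {
    abort.abort();
    return null;
  }

  // only now is the body actually read
  const payload = new Uint8Array(await resp.arrayBuffer());
  return { status: resp.status, mime, payload };
}
```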
parent a42c0b926e
commit 705bc0cd9f

4 changed files with 524 additions and 551 deletions
@@ -1059,58 +1059,43 @@ self.__bx_behaviors.selectMainBehavior();
     data.logDetails = logDetails;
     data.workerid = workerid;
 
+    let result = false;
+
     if (recorder) {
       try {
         const headers = auth
           ? { Authorization: auth, ...this.headers }
           : this.headers;
 
-        const result = await timedRun(
-          recorder.directFetchCapture({ url, headers, cdp }),
+        result = await timedRun(
+          recorder.directFetchCapture({
+            url,
+            headers,
+            cdp,
+            state: data,
+            crawler: this,
+          }),
           this.params.pageLoadTimeout,
           "Direct fetch of page URL timed out",
           logDetails,
           "fetch",
         );
-
-        // fetched timed out, already logged, don't retry in browser
-        if (!result) {
-          return;
-        }
-
-        const { fetched, mime, ts } = result;
-
-        if (mime) {
-          data.mime = mime;
-          data.isHTMLPage = isHTMLMime(mime);
-        }
-        if (fetched) {
-          data.loadState = LoadState.FULL_PAGE_LOADED;
-          data.status = 200;
-          data.ts = ts || new Date();
-          logger.info(
-            "Direct fetch successful",
-            { url, mime, ...logDetails },
-            "fetch",
-          );
-          return;
-        }
       } catch (e) {
-        if (e instanceof Error && e.message === "response-filtered-out") {
-          // filtered out direct fetch
-          logger.debug(
-            "Direct fetch response not accepted, continuing with browser fetch",
-            logDetails,
-            "fetch",
-          );
-        } else {
-          logger.error(
-            "Direct fetch of page URL failed",
-            { e, ...logDetails },
-            "fetch",
-          );
-          return;
-        }
-      }
-    }
+        logger.error(
+          "Direct fetch of page URL failed",
+          { e, ...logDetails },
+          "fetch",
+        );
+      }
+
+      if (!result) {
+        logger.debug(
+          "Direct fetch response not accepted, continuing with browser fetch",
+          logDetails,
+          "fetch",
+        );
+      } else {
+        return;
+      }
+    }
 
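For context on the "first trying browser networking for regular GET, fallback to regular fetch()" bullet, here is a hedged sketch of that fallback shape. loadViaBrowserNetwork is a stand-in name, not an API shown in this diff.

```ts
// Hypothetical sketch of "try browser networking first, fall back to fetch()".
// loadViaBrowserNetwork() stands in for whatever CDP-based loading the
// recorder actually uses; it is not an API from this commit.
async function unifiedAsyncFetch(
  url: string,
  method: string,
  loadViaBrowserNetwork: (url: string) => Promise<Response | null>,
): Promise<Response | null> {
  if (method === "GET") {
    try {
      const resp = await loadViaBrowserNetwork(url);
      if (resp) {
        return resp;
      }
    } catch (e) {
      // fall through to a regular fetch() below
    }
  }
  return await fetch(url, { method });
}
```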
@@ -1280,6 +1265,10 @@ self.__bx_behaviors.selectMainBehavior();
   }
 
   async pageFinished(data: PageState, lastErrorText = "") {
+    // not yet finished
+    if (data.asyncLoading) {
+      return;
+    }
     // if page loaded, considered page finished successfully
     // (even if behaviors timed out)
     const { loadState, logDetails, depth, url, pageSkipped } = data;
File diff suppressed because it is too large
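The recorder changes themselves are suppressed above. As a rough, assumed illustration of the "WARC digest computed ... for streaming loading" bullet (not the actual serializeWARC() implementation), a payload digest can be accumulated while the body streams:

```ts
import { createHash } from "node:crypto";

// Hypothetical sketch: hash a payload stream while buffering it, so a
// WARC-Payload-Digest header can be written once streaming completes.
async function digestWhileStreaming(payload: AsyncIterable<Uint8Array>) {
  const hash = createHash("sha256");
  const chunks: Uint8Array[] = [];
  let size = 0;

  for await (const chunk of payload) {
    hash.update(chunk);
    chunks.push(chunk);
    size += chunk.length;
  }

  return {
    digest: "sha256:" + hash.digest("hex"),
    size,
    body: Buffer.concat(chunks),
  };
}
```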
@@ -85,6 +85,7 @@ export class PageState {
   skipBehaviors = false;
   pageSkipped = false;
+  asyncLoading = false;
   filteredFrames: Frame[] = [];
   loadState: LoadState = LoadState.FAILED;
   contentCheckAllowed = false;
 
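Tying the new asyncLoading flag to the early return added to pageFinished() above, the handshake presumably looks something like the following sketch. The real logic lives in the suppressed recorder diff; the helper name and parameter shapes here are assumptions.

```ts
// Hypothetical sketch of the asyncLoading handshake implied by the diffs above:
// the worker marks the page as still loading asynchronously and moves on;
// whoever finishes the background fetch clears the flag and re-triggers
// pageFinished(), which then no longer returns early.
async function directFetchInBackground(
  crawler: { pageFinished(data: { asyncLoading: boolean }): Promise<void> },
  data: { asyncLoading: boolean },
  loadBody: () => Promise<void>,
) {
  data.asyncLoading = true; // pageFinished() returns early while this is set

  loadBody()
    .catch(() => {
      /* errors would be logged by the real recorder */
    })
    .finally(async () => {
      data.asyncLoading = false;
      await crawler.pageFinished(data); // now allowed to complete the page
    });
}
```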
@@ -76,7 +76,7 @@ test("PDF: check that the pages.jsonl file entry contains status code and mime t
   expect(pageH.loadState).toBe(2);
 });
 
-test("PDF: check that CDX contains one pdf 200, one 301 and one 200, two pageinfo entries", () => {
+test("PDF: check that CDX contains data from two crawls: one pdf 200, one 301 and one 200, two pageinfo entries", () => {
   const filedata = fs.readFileSync(
     "test-crawls/collections/crawl-pdf/indexes/index.cdxj",
     { encoding: "utf-8" },
@@ -90,6 +90,7 @@ test("PDF: check that CDX contains one pdf 200, one 301 and one 200, two pageinf
   expect(cdxj[0].url).toBe(PDF_HTTP);
   expect(cdxj[0].status).toBe("301");
 
+  // this is duplicated as this is data from two crawls
   expect(cdxj[1].url).toBe(PDF);
   expect(cdxj[1].status).toBe("200");
   expect(cdxj[1].mime).toBe("application/pdf");
@@ -149,7 +150,7 @@ test("XML: check that CDX contains one xml 200, one 301 and one 200, two pageinf
   const lines = filedata.trim().split("\n");
   const cdxj = lines.map(line => JSON.parse(line.split(" ").slice(2).join(" "))).sort((a, b) => a.url < b.url ? -1 : 1);
 
-  expect(cdxj.length).toBe(6);
+  expect(cdxj.length).toBe(5);
 
   expect(cdxj[0].url).toBe("https://webrecorder.net/favicon.ico");
 
@@ -157,18 +158,14 @@ test("XML: check that CDX contains one xml 200, one 301 and one 200, two pageinf
   expect(cdxj[1].status).toBe("200");
   expect(cdxj[1].mime).toBe("application/xml");
 
-  expect(cdxj[2].url).toBe(XML);
-  expect(cdxj[2].status).toBe("200");
-  expect(cdxj[2].mime).toBe("application/xml");
-
-  expect(cdxj[3].url).toBe(XML_REDIR);
-  expect(cdxj[3].status).toBe("301");
+  expect(cdxj[2].url).toBe(XML_REDIR);
+  expect(cdxj[2].status).toBe("301");
 
-  expect(cdxj[4].url).toBe("urn:pageinfo:" + XML);
-  expect(cdxj[4].mime).toBe("application/json");
+  expect(cdxj[3].url).toBe("urn:pageinfo:" + XML);
+  expect(cdxj[3].mime).toBe("application/json");
 
-  expect(cdxj[5].url).toBe("urn:pageinfo:" + XML_REDIR);
-  expect(cdxj[5].mime).toBe("application/json");
+  expect(cdxj[4].url).toBe("urn:pageinfo:" + XML_REDIR);
+  expect(cdxj[4].mime).toBe("application/json");
 });
 
 