diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 36a840e0..7fd10b39 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -61,7 +61,7 @@ jobs: run: yarn add -D http-server - name: install py-wacz as root for tests - run: sudo pip install wacz + run: sudo pip install wacz --ignore-installed - name: run all tests as root run: sudo DOCKER_HOST_NAME=172.17.0.1 CI=true yarn test -validate diff --git a/package.json b/package.json index 50fb52fd..b7474f0a 100644 --- a/package.json +++ b/package.json @@ -18,7 +18,7 @@ "dependencies": { "@novnc/novnc": "1.4.0", "@webrecorder/wabac": "^2.20.7", - "browsertrix-behaviors": "^0.6.5", + "browsertrix-behaviors": "^0.6.6", "client-zip": "^2.4.5", "css-selector-parser": "^3.0.5", "fetch-socks": "^1.3.0", @@ -37,7 +37,7 @@ "tsc": "^2.0.4", "undici": "^6.18.2", "uuid": "8.3.2", - "warcio": "^2.4.2", + "warcio": "^2.4.3", "ws": "^7.4.4", "yargs": "^17.7.2" }, @@ -67,7 +67,7 @@ }, "resolutions": { "wrap-ansi": "7.0.0", - "warcio": "^2.4.2", + "warcio": "^2.4.3", "@novnc/novnc": "1.4.0" } } diff --git a/src/crawler.ts b/src/crawler.ts index d2923508..61f0a2ea 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -46,6 +46,7 @@ import { Browser } from "./util/browser.js"; import { ADD_LINK_FUNC, BEHAVIOR_LOG_FUNC, + FETCH_FUNC, DISPLAY, ExtractSelector, PAGE_OP_TIMEOUT_SECS, @@ -693,6 +694,7 @@ export class Crawler { cdp, workerid, callbacks, + recorder, frameIdToExecId, }: WorkerOpts) { await this.browser.setupPage({ page, cdp }); @@ -767,6 +769,10 @@ self.__bx_behaviors.selectMainBehavior(); this.behaviorsChecked = true; } + await page.exposeFunction(FETCH_FUNC, (url: string) => { + return recorder ? recorder.addExternalFetch(url, cdp) : true; + }); + await this.browser.addInitScript(page, initScript); } } diff --git a/src/util/constants.ts b/src/util/constants.ts index 6d914def..99ea25b3 100644 --- a/src/util/constants.ts +++ b/src/util/constants.ts @@ -24,6 +24,8 @@ export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"]; export const BEHAVIOR_LOG_FUNC = "__bx_log"; export const ADD_LINK_FUNC = "__bx_addLink"; +export const FETCH_FUNC = "__bx_fetch"; + export const MAX_DEPTH = 1000000; export const FETCH_HEADERS_TIMEOUT_SECS = 30; diff --git a/src/util/recorder.ts b/src/util/recorder.ts index f045e05c..a10c8175 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -127,6 +127,7 @@ export class Recorder { pendingRequests!: Map; skipIds!: Set; pageInfo!: PageInfoRecord; + mainFrameId: string | null = null; skipRangeUrls!: Map; swTargetId?: string | null; @@ -449,10 +450,14 @@ export class Recorder { { url, ...this.logDetails }, "recorder", ); + reqresp.deleteRange(); + reqresp.requestId = "0"; + const fetcher = new AsyncFetcher({ reqresp, + expectedSize: reqresp.expectedSize ? reqresp.expectedSize : -1, recorder: this, - networkId: requestId, + networkId: "0", }); void this.fetcherQ.add(() => fetcher.load()); return; @@ -574,6 +579,12 @@ export class Recorder { const networkId = params.networkId || requestId; + const reqresp = this.pendingReqResp(networkId); + + if (!reqresp) { + return false; + } + if (responseErrorReason) { logger.warn( "Skipping failed response", @@ -601,24 +612,23 @@ export class Recorder { ); this.removeReqResp(networkId); - const reqresp = new RequestResponseInfo("0"); - reqresp.fillRequest(params.request, params.resourceType); - if (reqresp.requestHeaders) { - delete reqresp.requestHeaders["range"]; - delete reqresp.requestHeaders["Range"]; - } - reqresp.frameId = params.frameId; + if (!reqresp.fetchContinued) { + const reqrespNew = new RequestResponseInfo("0"); + reqrespNew.fillRequest(params.request, params.resourceType); + reqrespNew.deleteRange(); + reqrespNew.frameId = params.frameId; - this.addAsyncFetch( - { - reqresp, - expectedSize: parseInt(range.split("/")[1]), - recorder: this, - networkId: "0", - cdp, - }, - contentLen, - ); + this.addAsyncFetch( + { + reqresp: reqrespNew, + expectedSize: parseInt(range.split("/")[1]), + recorder: this, + networkId: "0", + cdp, + }, + contentLen, + ); + } return false; } else { @@ -651,27 +661,23 @@ export class Recorder { "recorder", ); - const reqresp = new RequestResponseInfo("0"); - reqresp.fillRequest(params.request, params.resourceType); - reqresp.url = filteredUrl; - reqresp.frameId = params.frameId; + if (!reqresp.fetchContinued) { + const reqrespNew = new RequestResponseInfo("0"); + reqrespNew.fillRequest(params.request, params.resourceType); + reqrespNew.url = filteredUrl; + reqrespNew.frameId = params.frameId; - this.addAsyncFetch({ - reqresp, - recorder: this, - networkId: "0", - cdp, - }); + this.addAsyncFetch({ + reqresp: reqrespNew, + recorder: this, + networkId: "0", + cdp, + }); + } return false; } } - const reqresp = this.pendingReqResp(networkId); - - if (!reqresp) { - return false; - } - // indicate that this is intercepted in the page context if (!isBrowserContext) { reqresp.inPageContext = true; @@ -696,6 +702,7 @@ export class Recorder { }); this.pageInfo.ts = reqresp.ts; this.pageInfo.tsStatus = responseStatusCode!; + this.mainFrameId = params.frameId; } reqresp.fillFetchRequestPaused(params); @@ -842,6 +849,22 @@ export class Recorder { void this.fetcherQ.add(() => fetcher.load()); } + addExternalFetch(url: string, cdp: CDPSession) { + const reqresp = new RequestResponseInfo("0"); + reqresp.url = url; + reqresp.method = "GET"; + reqresp.frameId = this.mainFrameId || undefined; + const fetcher = new NetworkLoadStreamAsyncFetcher({ + reqresp, + recorder: this, + cdp, + networkId: "0", + }); + void this.fetcherQ.add(() => fetcher.load()); + // return true if successful + return true; + } + startPage({ pageid, url }: { pageid: string; url: string }) { this.pageid = pageid; this.pageUrl = url; @@ -864,6 +887,7 @@ export class Recorder { counts: { jsErrors: 0 }, tsStatus: 999, }; + this.mainFrameId = null; } addPageRecord(reqresp: RequestResponseInfo) { @@ -1485,6 +1509,7 @@ class AsyncFetcher { logger.warn( "Async fetch: possible response size mismatch", { + type: this.constructor.name, size: reqresp.readSize, expected: reqresp.expectedSize, url, @@ -1492,7 +1517,8 @@ class AsyncFetcher { }, "recorder", ); - if (status === 206) { + if (status === 206 || status === 200) { + void serializer.externalBuffer?.purge(); await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status); return "notfetched"; } diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index 5548cb5b..8c738a33 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -335,6 +335,13 @@ export class RequestResponseInfo { return this.fromCache && !this.payload; } + deleteRange() { + if (this.requestHeaders) { + delete this.requestHeaders["range"]; + delete this.requestHeaders["Range"]; + } + } + shouldSkipSave() { // skip cached, OPTIONS/HEAD responses, and 304 responses if ( diff --git a/src/util/worker.ts b/src/util/worker.ts index 256d4241..c1e44cfd 100644 --- a/src/util/worker.ts +++ b/src/util/worker.ts @@ -27,6 +27,7 @@ export type WorkerOpts = { directFetchCapture: | ((request: DirectFetchRequest) => Promise) | null; + recorder: Recorder | null; markPageUsed: () => void; frameIdToExecId: Map; isAuthSet?: boolean; @@ -183,6 +184,7 @@ export class PageWorker { cdp, workerid, callbacks: this.callbacks, + recorder: this.recorder, directFetchCapture, frameIdToExecId: new Map(), markPageUsed: () => { diff --git a/yarn.lock b/yarn.lock index 2b28567d..7a80fa97 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1435,10 +1435,10 @@ browserslist@^4.24.0: node-releases "^2.0.18" update-browserslist-db "^1.1.1" -browsertrix-behaviors@^0.6.5: - version "0.6.5" - resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.6.5.tgz#a8e3da231caff8e54e34cac6ed3ff431c68b664d" - integrity sha512-URUMUPdU0O2J8rmgzrzY4BzT8vv/iYNQUf/B1Eif3ntMsMC51R4/MGgOC8d7pUDCfy5tnOkV1FGlhB9A5LLQrw== +browsertrix-behaviors@^0.6.6: + version "0.6.6" + resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.6.6.tgz#10bcccfb091c051f5c886d5f69487e6d184078de" + integrity sha512-UPNcU9dV0nAvUwJHKKYCkuqdYdlMjK7AWYDyr4xBpSq55xmEh2wQlwQyDyJuUUUrhJNII4NqXK24hVXPdvf5VA== dependencies: query-selector-shadow-dom "^1.0.1" @@ -5006,10 +5006,10 @@ walker@^1.0.8: dependencies: makeerror "1.0.12" -warcio@^2.4.0, warcio@^2.4.2: - version "2.4.2" - resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.2.tgz#782d8dcb0769f271b0ae96521fb4969e2570e9b3" - integrity sha512-QYbZ3EGYtnAIrzL7Bajo7ak87pipilpkIfaFIzFQWUX4wuXNuKqnfQy/EAoi2tEIl3VJgsWcL+wjjk4+15MKbQ== +warcio@^2.4.0, warcio@^2.4.2, warcio@^2.4.3: + version "2.4.3" + resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.3.tgz#37ff95c2358d0d5ddb16e924fe200c4774b3903d" + integrity sha512-c397HNfLE7yJsyVF3XKXB+Yh3q3WKljhdYRPkKF9eyZMtB+HIxj1aBqgq0nTYz492KMKtzygBo0Gx+Gi0fJ9dg== dependencies: "@types/pako" "^1.0.7" "@types/stream-buffers" "^3.0.7"