mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Fix 206 response + general video handling (#646)
Refactors handling of 206 responses: - If a 206 response is encountered, and it's actually the full range, convert it to 200 and rename the `range` and `content-range` headers to `x-orig-range` and `x-orig-content-range`. This is to support rewriting of 206 responses for DASH manifests. - If a partial 206 response starts with `0-`, do a full async fetch separately. - If a partial 206 response does not start with `0-`, just ignore it (very likely a duplicate picked up when handling the `0-` response). - Don't stream content-types that can be rewritten, since streaming prevents rewriting. Fixes rewriting of DASH/HLS manifests which have no content-length and don't get properly rewritten. - Overall, adds missing rewriting of DASH/HLS manifests that have no content-length and are served as 206. - Update to latest wabac.js, which fixes rewriting of DASH manifests to avoid a duplicate '<?xml' prefix, webrecorder/wabac.js#192 - Fixes #645
This commit is contained in:
parent
01666b4474
commit
88a2fbd0a0
6 changed files with 128 additions and 59 deletions
|
@ -42,7 +42,7 @@ ADD config/ /app/
|
|||
|
||||
ADD html/ /app/html/
|
||||
|
||||
ARG RWP_VERSION=2.1.1
|
||||
ARG RWP_VERSION=2.1.2
|
||||
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/
|
||||
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
|
||||
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "browsertrix-crawler",
|
||||
"version": "1.2.4",
|
||||
"version": "1.2.5",
|
||||
"main": "browsertrix-crawler",
|
||||
"type": "module",
|
||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
|
@ -18,7 +18,7 @@
|
|||
"dependencies": {
|
||||
"@novnc/novnc": "^1.4.0",
|
||||
"@types/sax": "^1.2.7",
|
||||
"@webrecorder/wabac": "^2.19.1",
|
||||
"@webrecorder/wabac": "^2.19.4",
|
||||
"browsertrix-behaviors": "^0.6.2",
|
||||
"crc": "^4.3.2",
|
||||
"fetch-socks": "^1.3.0",
|
||||
|
|
|
@ -20,7 +20,7 @@ export function formatErr(e: unknown): Record<string, any> {
|
|||
} else if (typeof e === "object") {
|
||||
return e || {};
|
||||
} else {
|
||||
return { message: (e as object).toString() };
|
||||
return { message: (e as object) + "" };
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -45,6 +45,17 @@ const WRITE_DUPE_KEY = "s:writedupe";
|
|||
|
||||
const MIME_EVENT_STREAM = "text/event-stream";
|
||||
|
||||
const RW_MIME_TYPES = [
|
||||
"application/x-mpegURL",
|
||||
"application/vnd.apple.mpegurl",
|
||||
"application/dash+xml",
|
||||
"text/html",
|
||||
"application/json",
|
||||
"text/javascript",
|
||||
"application/javascript",
|
||||
"application/x-javascript",
|
||||
];
|
||||
|
||||
const encoder = new TextEncoder();
|
||||
|
||||
// =================================================================
|
||||
|
@ -76,7 +87,6 @@ export type PageInfoRecord = {
|
|||
|
||||
// =================================================================
|
||||
export type AsyncFetchOptions = {
|
||||
tempdir: string;
|
||||
reqresp: RequestResponseInfo;
|
||||
expectedSize?: number;
|
||||
// eslint-disable-next-line no-use-before-define
|
||||
|
@ -135,8 +145,6 @@ export class Recorder {
|
|||
logDetails: Record<string, any> = {};
|
||||
skipping = false;
|
||||
|
||||
allowFull206 = false;
|
||||
|
||||
tempdir: string;
|
||||
|
||||
gzip = true;
|
||||
|
@ -439,7 +447,6 @@ export class Recorder {
|
|||
"recorder",
|
||||
);
|
||||
const fetcher = new AsyncFetcher({
|
||||
tempdir: this.tempdir,
|
||||
reqresp,
|
||||
recorder: this,
|
||||
networkId: requestId,
|
||||
|
@ -572,15 +579,40 @@ export class Recorder {
|
|||
|
||||
if (responseStatusCode === 206) {
|
||||
const range = this._getContentRange(responseHeaders);
|
||||
if (
|
||||
this.allowFull206 &&
|
||||
range === `bytes 0-${contentLen - 1}/${contentLen}`
|
||||
) {
|
||||
if (range === `bytes 0-${contentLen - 1}/${contentLen}`) {
|
||||
logger.debug(
|
||||
"Keep 206 Response, Full Range",
|
||||
{ range, contentLen, url, networkId, ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
} else if (range?.startsWith("bytes 0-")) {
|
||||
logger.debug(
|
||||
"Re-request 206 Response without range",
|
||||
{ range, contentLen, url, ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
this.removeReqResp(networkId);
|
||||
|
||||
const reqresp = new RequestResponseInfo("0");
|
||||
reqresp.fillRequest(params.request, params.resourceType);
|
||||
if (reqresp.requestHeaders) {
|
||||
delete reqresp.requestHeaders["range"];
|
||||
delete reqresp.requestHeaders["Range"];
|
||||
}
|
||||
reqresp.frameId = params.frameId;
|
||||
|
||||
this.addAsyncFetch(
|
||||
{
|
||||
reqresp,
|
||||
expectedSize: parseInt(range.split("/")[1]),
|
||||
recorder: this,
|
||||
networkId: "0",
|
||||
cdp,
|
||||
},
|
||||
contentLen,
|
||||
);
|
||||
|
||||
return false;
|
||||
} else {
|
||||
logger.debug(
|
||||
"Skip 206 Response",
|
||||
|
@ -624,16 +656,17 @@ export class Recorder {
|
|||
return false;
|
||||
}
|
||||
|
||||
const mimeType = this.getMimeType(responseHeaders) || "";
|
||||
|
||||
let streamingConsume = false;
|
||||
|
||||
// if contentLength is large or unknown, do streaming, unless it's an essential resource
|
||||
// in which case, need to do a full fetch either way
|
||||
if (
|
||||
(contentLen < 0 || contentLen > MAX_BROWSER_DEFAULT_FETCH_SIZE) &&
|
||||
!this.isEssentialResource(reqresp.resourceType)
|
||||
!this.isEssentialResource(reqresp.resourceType, mimeType)
|
||||
) {
|
||||
const opts: ResponseStreamAsyncFetchOptions = {
|
||||
tempdir: this.tempdir,
|
||||
reqresp,
|
||||
expectedSize: contentLen,
|
||||
recorder: this,
|
||||
|
@ -659,14 +692,7 @@ export class Recorder {
|
|||
|
||||
// if not consumed via takeStream, attempt async loading
|
||||
if (!streamingConsume) {
|
||||
let fetcher: AsyncFetcher;
|
||||
|
||||
if (reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE) {
|
||||
fetcher = new AsyncFetcher(opts);
|
||||
} else {
|
||||
fetcher = new NetworkLoadStreamAsyncFetcher(opts);
|
||||
}
|
||||
this.fetcherQ.add(() => fetcher.load());
|
||||
this.addAsyncFetch(opts, contentLen);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
|
@ -698,7 +724,7 @@ export class Recorder {
|
|||
}
|
||||
}
|
||||
|
||||
const rewritten = await this.rewriteResponse(reqresp, responseHeaders);
|
||||
const rewritten = await this.rewriteResponse(reqresp, mimeType);
|
||||
|
||||
// if in service worker, serialize here
|
||||
// as won't be getting a loadingFinished message
|
||||
|
@ -746,6 +772,17 @@ export class Recorder {
|
|||
return true;
|
||||
}
|
||||
|
||||
addAsyncFetch(opts: NetworkLoadAsyncFetchOptions, contentLen: number) {
|
||||
let fetcher: AsyncFetcher;
|
||||
|
||||
if (opts.reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE) {
|
||||
fetcher = new AsyncFetcher(opts);
|
||||
} else {
|
||||
fetcher = new NetworkLoadStreamAsyncFetcher(opts);
|
||||
}
|
||||
this.fetcherQ.add(() => fetcher.load());
|
||||
}
|
||||
|
||||
startPage({ pageid, url }: { pageid: string; url: string }) {
|
||||
this.pageid = pageid;
|
||||
this.pageUrl = url;
|
||||
|
@ -927,10 +964,7 @@ export class Recorder {
|
|||
return false;
|
||||
}
|
||||
|
||||
async rewriteResponse(
|
||||
reqresp: RequestResponseInfo,
|
||||
responseHeaders?: Protocol.Fetch.HeaderEntry[],
|
||||
) {
|
||||
async rewriteResponse(reqresp: RequestResponseInfo, contentType: string) {
|
||||
const { url, extraOpts, payload } = reqresp;
|
||||
|
||||
// don't rewrite if payload is missing or too big
|
||||
|
@ -941,8 +975,6 @@ export class Recorder {
|
|||
let newString = null;
|
||||
let string = null;
|
||||
|
||||
const contentType = this._getContentType(responseHeaders);
|
||||
|
||||
switch (contentType) {
|
||||
case "application/x-mpegURL":
|
||||
case "application/vnd.apple.mpegurl":
|
||||
|
@ -983,17 +1015,26 @@ export class Recorder {
|
|||
"recorder",
|
||||
);
|
||||
reqresp.payload = encoder.encode(newString);
|
||||
reqresp.isRemoveRange = true;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
isEssentialResource(resourceType: string | undefined) {
|
||||
return ["document", "stylesheet", "script"].includes(resourceType || "");
|
||||
isEssentialResource(resourceType: string | undefined, contentType: string) {
|
||||
if (["document", "stylesheet", "script"].includes(resourceType || "")) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (RW_MIME_TYPES.includes(contentType)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
_getContentType(
|
||||
protected getMimeType(
|
||||
headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
|
||||
) {
|
||||
if (!headers) {
|
||||
|
@ -1008,7 +1049,7 @@ export class Recorder {
|
|||
return null;
|
||||
}
|
||||
|
||||
_getContentLen(headers?: Protocol.Fetch.HeaderEntry[]) {
|
||||
protected _getContentLen(headers?: Protocol.Fetch.HeaderEntry[]) {
|
||||
if (!headers) {
|
||||
return -1;
|
||||
}
|
||||
|
@ -1120,7 +1161,7 @@ export class Recorder {
|
|||
!isRedirectStatus(status) &&
|
||||
!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status))
|
||||
) {
|
||||
logNetwork("Skipping dupe", { url });
|
||||
logNetwork("Skipping dupe", { url, status, ...this.logDetails });
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1173,7 +1214,6 @@ export class Recorder {
|
|||
// ignore dupes: if previous URL was not a page, still load as page. if previous was page,
|
||||
// should not get here, as dupe pages tracked via seen list
|
||||
const fetcher = new AsyncFetcher({
|
||||
tempdir: this.tempdir,
|
||||
reqresp,
|
||||
recorder: this,
|
||||
networkId: "0",
|
||||
|
@ -1231,7 +1271,6 @@ class AsyncFetcher {
|
|||
manualRedirect = false;
|
||||
|
||||
constructor({
|
||||
tempdir,
|
||||
reqresp,
|
||||
expectedSize = -1,
|
||||
recorder,
|
||||
|
@ -1251,7 +1290,7 @@ class AsyncFetcher {
|
|||
|
||||
this.recorder = recorder;
|
||||
|
||||
this.tempdir = tempdir;
|
||||
this.tempdir = recorder.tempdir;
|
||||
this.filename = path.join(
|
||||
this.tempdir,
|
||||
`${timestampNow()}-${uuidv4()}.data`,
|
||||
|
@ -1604,7 +1643,7 @@ class NetworkLoadStreamAsyncFetcher extends AsyncFetcher {
|
|||
return;
|
||||
}
|
||||
|
||||
reqresp.status = httpStatusCode || 0;
|
||||
reqresp.setStatus(httpStatusCode || 200);
|
||||
reqresp.responseHeaders = headers || {};
|
||||
|
||||
return this.takeStreamIter(cdp, stream);
|
||||
|
@ -1618,6 +1657,10 @@ function createResponse(
|
|||
pageid: string,
|
||||
contentIter?: AsyncIterable<Uint8Array> | Iterable<Uint8Array>,
|
||||
) {
|
||||
if (reqresp.isRemoveRange && reqresp.status === 206) {
|
||||
reqresp.setStatus(200);
|
||||
}
|
||||
|
||||
const url = reqresp.url;
|
||||
const warcVersion = "WARC/1.1";
|
||||
const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`;
|
||||
|
|
|
@ -7,6 +7,8 @@ import { HTML_TYPES } from "./constants.js";
|
|||
import { Response } from "undici";
|
||||
|
||||
const CONTENT_LENGTH = "content-length";
|
||||
const CONTENT_RANGE = "content-range";
|
||||
const RANGE = "range";
|
||||
const CONTENT_TYPE = "content-type";
|
||||
const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"];
|
||||
|
||||
|
@ -46,6 +48,7 @@ export class RequestResponseInfo {
|
|||
responseHeadersText?: string;
|
||||
|
||||
payload?: Uint8Array;
|
||||
isRemoveRange = false;
|
||||
|
||||
// misc
|
||||
fromServiceWorker = false;
|
||||
|
@ -76,11 +79,17 @@ export class RequestResponseInfo {
|
|||
this.requestId = requestId;
|
||||
}
|
||||
|
||||
setStatus(status: number, statusText?: string) {
|
||||
this.status = status;
|
||||
this.statusText = statusText || getStatusText(this.status);
|
||||
}
|
||||
|
||||
fillFetchRequestPaused(params: Protocol.Fetch.RequestPausedEvent) {
|
||||
this.fillRequest(params.request, params.resourceType);
|
||||
|
||||
this.status = params.responseStatusCode || 0;
|
||||
this.statusText = params.responseStatusText || getStatusText(this.status);
|
||||
if (params.responseStatusCode) {
|
||||
this.setStatus(params.responseStatusCode, params.responseStatusText);
|
||||
}
|
||||
|
||||
this.responseHeadersList = params.responseHeaders;
|
||||
|
||||
|
@ -116,8 +125,7 @@ export class RequestResponseInfo {
|
|||
|
||||
this.url = response.url.split("#")[0];
|
||||
|
||||
this.status = response.status;
|
||||
this.statusText = response.statusText || getStatusText(this.status);
|
||||
this.setStatus(response.status, response.statusText);
|
||||
|
||||
this.protocol = response.protocol;
|
||||
|
||||
|
@ -182,8 +190,7 @@ export class RequestResponseInfo {
|
|||
|
||||
fillFetchResponse(response: Response) {
|
||||
this.responseHeaders = Object.fromEntries(response.headers);
|
||||
this.status = response.status;
|
||||
this.statusText = response.statusText || getStatusText(this.status);
|
||||
this.setStatus(response.status, response.statusText);
|
||||
}
|
||||
|
||||
fillRequestExtraInfo(
|
||||
|
@ -240,7 +247,11 @@ export class RequestResponseInfo {
|
|||
headersDict[headerName] = "" + actualContentLength;
|
||||
continue;
|
||||
}
|
||||
if (EXCLUDE_HEADERS.includes(headerName)) {
|
||||
if (
|
||||
EXCLUDE_HEADERS.includes(headerName) ||
|
||||
(this.isRemoveRange &&
|
||||
(headerName === CONTENT_RANGE || headerName === RANGE))
|
||||
) {
|
||||
headerName = "x-orig-" + headerName;
|
||||
}
|
||||
headersDict[headerName] = this._encodeHeaderValue(header.value);
|
||||
|
@ -263,7 +274,11 @@ export class RequestResponseInfo {
|
|||
}
|
||||
const value = this._encodeHeaderValue(headersDict[key]);
|
||||
|
||||
if (EXCLUDE_HEADERS.includes(keyLower)) {
|
||||
if (
|
||||
EXCLUDE_HEADERS.includes(keyLower) ||
|
||||
(this.isRemoveRange &&
|
||||
(keyLower === CONTENT_RANGE || keyLower === RANGE))
|
||||
) {
|
||||
headersDict["x-orig-" + key] = value;
|
||||
delete headersDict[key];
|
||||
} else {
|
||||
|
@ -316,11 +331,11 @@ export class RequestResponseInfo {
|
|||
}
|
||||
|
||||
shouldSkipSave() {
|
||||
// skip cached, OPTIONS/HEAD responses, and 304 or 206 responses
|
||||
// skip cached, OPTIONS/HEAD responses, and 304 responses
|
||||
if (
|
||||
this.fromCache ||
|
||||
(this.method && ["OPTIONS", "HEAD"].includes(this.method)) ||
|
||||
[206, 304].includes(this.status)
|
||||
this.status == 304
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
@ -330,6 +345,17 @@ export class RequestResponseInfo {
|
|||
return true;
|
||||
}
|
||||
|
||||
if (this.status === 206) {
|
||||
const headers = new Headers(this.getResponseHeadersDict());
|
||||
const contentLength: number = parseInt(
|
||||
headers.get(CONTENT_LENGTH) || "0",
|
||||
);
|
||||
const contentRange = headers.get(CONTENT_RANGE);
|
||||
if (contentRange !== `bytes 0-${contentLength - 1}/${contentLength}`) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
22
yarn.lock
22
yarn.lock
|
@ -1300,21 +1300,21 @@
|
|||
resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
|
||||
integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==
|
||||
|
||||
"@webrecorder/wabac@^2.19.1":
|
||||
version "2.19.1"
|
||||
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.1.tgz#ce0d609f9e90c708af99945e1fa338be0ba2b5f9"
|
||||
integrity sha512-m8Fi70OkhzkicbcbN5TrrBpj5D/EZKzVp5905kGPoC2F2zLqxUDMzx1FOHt2sTO/1b9NMvBmw9Pk1JQyYEm6rA==
|
||||
"@webrecorder/wabac@^2.19.4":
|
||||
version "2.19.4"
|
||||
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.4.tgz#6c91a65928413b8394f17b57f57a803dcb111dbe"
|
||||
integrity sha512-USWUoreSfgyeYYrC2/o2YYr4dCUSwgOSzbpdapqh90VQ4Fb0fjwPAiessBCH4rA5yd9QpOgWdkapDmXvLx6Bww==
|
||||
dependencies:
|
||||
"@peculiar/asn1-ecc" "^2.3.4"
|
||||
"@peculiar/asn1-schema" "^2.3.3"
|
||||
"@peculiar/x509" "^1.9.2"
|
||||
"@webrecorder/wombat" "^3.7.8"
|
||||
"@webrecorder/wombat" "^3.7.11"
|
||||
acorn "^8.10.0"
|
||||
auto-js-ipfs "^2.1.1"
|
||||
base64-js "^1.5.1"
|
||||
brotli "^1.3.3"
|
||||
buffer "^6.0.3"
|
||||
fast-xml-parser "^4.2.5"
|
||||
fast-xml-parser "^4.4.0"
|
||||
hash-wasm "^4.9.0"
|
||||
http-link-header "^1.1.3"
|
||||
http-status-codes "^2.1.4"
|
||||
|
@ -1329,10 +1329,10 @@
|
|||
stream-browserify "^3.0.0"
|
||||
warcio "^2.2.1"
|
||||
|
||||
"@webrecorder/wombat@^3.7.8":
|
||||
version "3.7.8"
|
||||
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.8.tgz#a414278b6fbd99bc02a97e384f0373307e60d9fa"
|
||||
integrity sha512-BmEHrvGLHPQtECmCK9Oz7G3p2StsyaFOlNmAMDSNK/GjqPH+UWZOqDryBkWryTh+pFZXKblqyotLtvR4YxVyeQ==
|
||||
"@webrecorder/wombat@^3.7.11":
|
||||
version "3.7.11"
|
||||
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.11.tgz#27539f52317b2d80af4f28d971d59b53bc0f2b96"
|
||||
integrity sha512-WlGpKjHUpP2aZo/OrY5aduNX/TVdo+hSkzu9as/63wSQ4ZFWIqZ+pxYXci43hjV5oVjcMP4KALLq+V+Fuo8qSA==
|
||||
dependencies:
|
||||
warcio "^2.2.0"
|
||||
|
||||
|
@ -2431,7 +2431,7 @@ fast-xml-parser@^4.2.2:
|
|||
dependencies:
|
||||
strnum "^1.0.5"
|
||||
|
||||
fast-xml-parser@^4.2.5:
|
||||
fast-xml-parser@^4.4.0:
|
||||
version "4.4.0"
|
||||
resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.4.0.tgz#341cc98de71e9ba9e651a67f41f1752d1441a501"
|
||||
integrity sha512-kLY3jFlwIYwBNDojclKsNAC12sfD6NwW74QB2CoNGPvtVxjliYehVunB3HYyNi+n4Tt1dAcgwYvmKF/Z18flqg==
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue