From 8d7fb1e08476bf37458e60328a5976ba81dda4fe Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 13 Aug 2024 23:38:55 -0700 Subject: [PATCH] 1.2.8 updates: (#668) - rewriting: update wabac.js, use getCustomRewriter(), don't truncate POST request bodies for URLs that use a custom rewriter - browser: disable --enable-automation, setting webdriver = true, so no need for override - deps: update puppeteer-core, necessary changes for latest puppeteer --- Dockerfile | 2 +- package.json | 8 +++--- src/util/browser.ts | 7 +---- src/util/recorder.ts | 12 +++----- src/util/reqresp.ts | 9 ++++-- src/util/screenshots.ts | 2 +- yarn.lock | 63 +++++++++++++++++++++++------------------ 7 files changed, 53 insertions(+), 50 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1ee5c598..f187fdc9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -42,7 +42,7 @@ ADD config/ /app/ ADD html/ /app/html/ -ARG RWP_VERSION=2.1.3 +ARG RWP_VERSION=2.1.4 ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/ ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/ ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz diff --git a/package.json b/package.json index 1fe9b57c..1625649b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.2.7", + "version": "1.2.8", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", @@ -18,8 +18,8 @@ "dependencies": { "@novnc/novnc": "^1.4.0", "@types/sax": "^1.2.7", - "@webrecorder/wabac": "^2.19.4", - "browsertrix-behaviors": "^0.6.3", + "@webrecorder/wabac": "^2.19.7", + "browsertrix-behaviors": "^0.6.4", "fetch-socks": "^1.3.0", "get-folder-size": "^4.0.0", "husky": "^8.0.3", @@ -30,7 +30,7 @@ "p-queue": "^7.3.4", "pixelmatch": "^5.3.0", "pngjs": "^7.0.0", - "puppeteer-core": "^22.14.0", + "puppeteer-core": "^23.0.2", "sax": "^1.3.0", "sharp": "^0.32.6", "tsc": "^2.0.4", diff --git a/src/util/browser.ts b/src/util/browser.ts index 69ed78b1..091834e7 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -113,7 +113,7 @@ export class Browser { headless, executablePath: this.getBrowserExe(), ignoreDefaultArgs: ["--enable-automation", "--hide-scrollbars"], - ignoreHTTPSErrors: true, + acceptInsecureCerts: true, handleSIGHUP: signals, handleSIGINT: signals, handleSIGTERM: signals, @@ -140,11 +140,6 @@ export class Browser { } async setupPage({ page }: { page: Page; cdp: CDPSession }) { - await this.addInitScript( - page, - 'Object.defineProperty(navigator, "webdriver", {value: false});', - ); - switch (this.swOpt) { case "disabled": logger.debug("Service Workers: always disabled", {}, "browser"); diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 4b59715d..f9481f2a 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -14,11 +14,8 @@ import { import { fetch, Response } from "undici"; -import { - baseRules as baseDSRules, - htmlRules as htmlDSRules, - // @ts-expect-error TODO fill in why error is expected -} from "@webrecorder/wabac/src/rewrite/index.js"; +// @ts-expect-error TODO fill in why error is expected +import { getCustomRewriter } from "@webrecorder/wabac/src/rewrite/index.js"; import { rewriteDASH, rewriteHLS, @@ -1003,10 +1000,9 @@ export class Recorder { case "text/javascript": case "application/javascript": case "application/x-javascript": { - const rules = contentType === "text/html" ? htmlDSRules : baseDSRules; - const rw = rules.getRewriter(url); + const rw = getCustomRewriter(url, isHTMLMime(contentType)); - if (rw !== rules.defaultRewriter) { + if (rw) { string = payload.toString(); newString = rw.rewrite(string, { live: true, save: extraOpts }); } diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index 5132168e..2713f276 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -1,5 +1,7 @@ // @ts-expect-error TODO fill in why error is expected import { getStatusText } from "@webrecorder/wabac/src/utils.js"; +// @ts-expect-error TODO fill in why error is expected +import { getCustomRewriter } from "@webrecorder/wabac/src/rewrite/index.js"; import { Protocol } from "puppeteer-core"; import { postToGetUrl } from "warcio"; @@ -372,8 +374,11 @@ export class RequestResponseInfo { }; if (postToGetUrl(convData)) { - //this.requestBody = convData.requestBody; - // truncate to avoid extra long URLs + // if not custom rewrite, truncate to avoid extra long URLs + if (getCustomRewriter(this.url, isHTMLMime(this.getMimeType() || ""))) { + return convData.url; + } + try { const url = new URL(convData.url); for (const [key, value] of url.searchParams.entries()) { diff --git a/src/util/screenshots.ts b/src/util/screenshots.ts index e04757c1..d794049c 100644 --- a/src/util/screenshots.ts +++ b/src/util/screenshots.ts @@ -70,7 +70,7 @@ export class Screenshots { }); } const options = screenshotTypes[screenshotType]; - const screenshotBuffer = await this.page.screenshot(options); + const screenshotBuffer = Buffer.from(await this.page.screenshot(options)); if (state && screenshotType === "view") { state.screenshotView = screenshotBuffer; } diff --git a/yarn.lock b/yarn.lock index c851b392..85f7ed8a 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1300,21 +1300,21 @@ resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406" integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ== -"@webrecorder/wabac@^2.19.4": - version "2.19.4" - resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.4.tgz#6c91a65928413b8394f17b57f57a803dcb111dbe" - integrity sha512-USWUoreSfgyeYYrC2/o2YYr4dCUSwgOSzbpdapqh90VQ4Fb0fjwPAiessBCH4rA5yd9QpOgWdkapDmXvLx6Bww== +"@webrecorder/wabac@^2.19.7": + version "2.19.7" + resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.7.tgz#3afe48f79752bcd189cffd5d5e6a8dbe4f394053" + integrity sha512-X9UFxWCww1KWDnAaEjg7vpg6SznBov5a88FPxbOvo5yCT/UkJcQHaa0qo1L52l46sIAUnSbsYz1ur9yMd6ygVA== dependencies: "@peculiar/asn1-ecc" "^2.3.4" "@peculiar/asn1-schema" "^2.3.3" "@peculiar/x509" "^1.9.2" - "@webrecorder/wombat" "^3.7.11" + "@webrecorder/wombat" "^3.7.14" acorn "^8.10.0" auto-js-ipfs "^2.1.1" base64-js "^1.5.1" brotli "^1.3.3" buffer "^6.0.3" - fast-xml-parser "^4.4.0" + fast-xml-parser "^4.4.1" hash-wasm "^4.9.0" http-link-header "^1.1.3" http-status-codes "^2.1.4" @@ -1329,10 +1329,10 @@ stream-browserify "^3.0.0" warcio "^2.2.1" -"@webrecorder/wombat@^3.7.11": - version "3.7.11" - resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.11.tgz#27539f52317b2d80af4f28d971d59b53bc0f2b96" - integrity sha512-WlGpKjHUpP2aZo/OrY5aduNX/TVdo+hSkzu9as/63wSQ4ZFWIqZ+pxYXci43hjV5oVjcMP4KALLq+V+Fuo8qSA== +"@webrecorder/wombat@^3.7.14": + version "3.7.14" + resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.14.tgz#3779e4cadb256755bbbfd2960805965ec4daacd8" + integrity sha512-sDNH+c8WstQrK91y8kIPJh1XAC2WXLU5rC8wztANzK1mVzA7v6XB5gk3Yp7OIAn4bn1XuGRVjubhKhmxVVZ9kg== dependencies: warcio "^2.2.0" @@ -1677,10 +1677,10 @@ browserslist@^4.22.2: node-releases "^2.0.14" update-browserslist-db "^1.0.13" -browsertrix-behaviors@^0.6.3: - version "0.6.3" - resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.6.3.tgz#cdd6457bcc718cc30257fd754a2c12191a6431a2" - integrity sha512-fr9w8ANqmxDid4Ile+dYjwcU5nD4+ZhTBVID2zBYWNoSoFkrEILUtpSAbBmLtr5Ujulxjn71uUQwMOfAFAUqzw== +browsertrix-behaviors@^0.6.4: + version "0.6.4" + resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.6.4.tgz#33fe9a433108f2faac3a03af91aff940433e5b87" + integrity sha512-xaiO/VqqeSd5FnAkIKQINxC/q3Med33Lqw3LGxD4NBtkcMSh1Anz/+830QHVlQbp08nIPUXYV96hDrx1Uv0PmQ== dependencies: query-selector-shadow-dom "^1.0.1" @@ -1801,10 +1801,10 @@ chownr@^1.1.1: resolved "https://registry.yarnpkg.com/chownr/-/chownr-1.1.4.tgz#6fc9d7b42d32a583596337666e7d08084da2cc6b" integrity sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg== -chromium-bidi@0.6.2: - version "0.6.2" - resolved "https://registry.yarnpkg.com/chromium-bidi/-/chromium-bidi-0.6.2.tgz#91f9daa20984833b52221084480fbe0465b29c67" - integrity sha512-4WVBa6ijmUTVr9cZD4eicQD8Mdy/HCX3bzEIYYpmk0glqYLoWH+LqQEvV9RpDRzoQSbY1KJHloYXbDMXMbDPhg== +chromium-bidi@0.6.4: + version "0.6.4" + resolved "https://registry.yarnpkg.com/chromium-bidi/-/chromium-bidi-0.6.4.tgz#627d76bae2819d59b61a413babe9664e0a16b71d" + integrity sha512-8zoq6ogmhQQkAKZVKO2ObFTl4uOkqoX1PlKQX3hZQ5E9cbUotcAb7h4pTNVAGGv8Z36PF3CtdOriEp/Rz82JqQ== dependencies: mitt "3.0.1" urlpattern-polyfill "10.0.0" @@ -1973,6 +1973,13 @@ debug@^4.3.5: dependencies: ms "2.1.2" +debug@^4.3.6: + version "4.3.6" + resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.6.tgz#2ab2c38fbaffebf8aa95fdfe6d88438c7a13c52b" + integrity sha512-O/09Bd4Z1fBrU4VzkhFqVgpPzaGbw6Sm9FEkBT1A/YBXQFGuuSxa1dN2nxgxS34JmKXqYx8CZAwEVoJFImUXIg== + dependencies: + ms "2.1.2" + decode-uri-component@^0.2.2: version "0.2.2" resolved "https://registry.yarnpkg.com/decode-uri-component/-/decode-uri-component-0.2.2.tgz#e69dbe25d37941171dd540e024c444cd5188e1e9" @@ -2428,10 +2435,10 @@ fast-xml-parser@^4.2.2: dependencies: strnum "^1.0.5" -fast-xml-parser@^4.4.0: - version "4.4.0" - resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.4.0.tgz#341cc98de71e9ba9e651a67f41f1752d1441a501" - integrity sha512-kLY3jFlwIYwBNDojclKsNAC12sfD6NwW74QB2CoNGPvtVxjliYehVunB3HYyNi+n4Tt1dAcgwYvmKF/Z18flqg== +fast-xml-parser@^4.4.1: + version "4.4.1" + resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.4.1.tgz#86dbf3f18edf8739326447bcaac31b4ae7f6514f" + integrity sha512-xkjOecfnKGkSsOwtZ5Pz7Us/T6mrbPQrq0nh+aCO5V9nk5NLWmasAHumTKjiPJPWANe+kAZ84Jc8ooJkzZ88Sw== dependencies: strnum "^1.0.5" @@ -4345,14 +4352,14 @@ punycode@^2.1.0: resolved "https://registry.yarnpkg.com/punycode/-/punycode-2.1.1.tgz#b58b010ac40c22c5657616c8d2c2c02c7bf479ec" integrity sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A== -puppeteer-core@^22.14.0: - version "22.14.0" - resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-22.14.0.tgz#5bb466adba725c966b0a86f0337a476d4c68ebec" - integrity sha512-rl4tOY5LcA3e374GAlsGGHc05HL3eGNf5rZ+uxkl6id9zVZKcwcp1Z+Nd6byb6WPiPeecT/dwz8f/iUm+AZQSw== +puppeteer-core@^23.0.2: + version "23.0.2" + resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-23.0.2.tgz#343c8d003e609620febfe35f76847a0014cdc97c" + integrity sha512-MvOHn+g1TYkAR2oVd/bf/YWXKqFTJmkhyyurYgxkrjh8rBOL1ZH5VyOsLJi0bLO7/yoipAmk1gFZEx9HUJnaoA== dependencies: "@puppeteer/browsers" "2.3.0" - chromium-bidi "0.6.2" - debug "^4.3.5" + chromium-bidi "0.6.4" + debug "^4.3.6" devtools-protocol "0.0.1312386" ws "^8.18.0"