From ec07cc99f10f26a20a5a1b45977bea2fc3398f63 Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Fri, 31 May 2024 09:31:41 +0200 Subject: [PATCH] WIP 1 --- package.json | 3 ++ src/util/browser.ts | 3 +- src/util/recorder.ts | 22 ++++++++++- src/util/reqresp.ts | 2 + yarn.lock | 91 +++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 117 insertions(+), 4 deletions(-) diff --git a/package.json b/package.json index f05071eb..1afd24ad 100644 --- a/package.json +++ b/package.json @@ -17,6 +17,7 @@ }, "dependencies": { "@novnc/novnc": "^1.4.0", + "@types/node-fetch": "^2.6.11", "@types/sax": "^1.2.7", "@webrecorder/wabac": "^2.16.12", "browsertrix-behaviors": "^0.6.0", @@ -27,12 +28,14 @@ "js-levenshtein": "^1.1.6", "js-yaml": "^4.1.0", "minio": "^7.1.3", + "node-fetch": "^3.3.2", "p-queue": "^7.3.4", "pixelmatch": "^5.3.0", "pngjs": "^7.0.0", "puppeteer-core": "^22.6.1", "sax": "^1.3.0", "sharp": "^0.32.6", + "socks-proxy-agent": "^8.0.3", "tsc": "^2.0.4", "uuid": "8.3.2", "warcio": "^2.2.1", diff --git a/src/util/browser.ts b/src/util/browser.ts index 8b8ac980..22579e95 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -115,7 +115,7 @@ export class Browser { ? undefined : (target) => this.targetFilter(target), }; - + logger.info("Launching browser", launchOpts); await this._init(launchOpts, ondisconnect, recording); } @@ -237,6 +237,7 @@ export class Browser { ]; proxy = proxy || this.getProxy(); + logger.info(`Proxy settings: ${proxy}`); if (proxy) { args.push("--ignore-certificate-errors"); diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 182571ea..b193bcea 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -8,6 +8,13 @@ import { logger, formatErr } from "./logger.js"; import { sleep, timedRun, timestampNow } from "./timing.js"; import { RequestResponseInfo, isHTMLContentType } from "./reqresp.js"; +import { SocksProxyAgent } from "socks-proxy-agent"; + +import fetch, { Response } from "node-fetch"; + +import { default as stream } from "node:stream"; +import type { ReadableStream } from "node:stream/web"; + // @ts-expect-error TODO fill in why error is expected import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js"; import { @@ -1165,6 +1172,8 @@ class AsyncFetcher { manualRedirect = false; + socksAgent: SocksProxyAgent | null = null; + constructor({ tempdir, reqresp, @@ -1195,6 +1204,10 @@ class AsyncFetcher { this.maxFetchSize = maxFetchSize; this.manualRedirect = manualRedirect; + + if (process.env.PROXY_SERVER) { + this.socksAgent = new SocksProxyAgent(process.env.PROXY_SERVER); + } } async load() { @@ -1361,6 +1374,7 @@ class AsyncFetcher { body: reqresp.postData || undefined, signal, redirect: this.manualRedirect ? "manual" : "follow", + agent: this.socksAgent || undefined, }); if (this.filter && !this.filter(resp) && abort) { @@ -1386,10 +1400,14 @@ class AsyncFetcher { reqresp.fillFetchResponse(resp); - return this.takeReader(resp.body.getReader()); + const reader = stream.Readable.fromWeb( + resp.body as unknown as ReadableStream, + ); + + return this.takeReader(reader); } - async *takeReader(reader: ReadableStreamDefaultReader) { + async *takeReader(reader: stream.Readable) { let size = 0; try { while (true) { diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index 1fc9fbe5..515e6517 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -5,6 +5,8 @@ import { Protocol } from "puppeteer-core"; import { postToGetUrl } from "warcio"; import { HTML_TYPES } from "./constants.js"; +import { Response } from "node-fetch"; + const CONTENT_LENGTH = "content-length"; const CONTENT_TYPE = "content-type"; const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"]; diff --git a/yarn.lock b/yarn.lock index 07506669..d7f034b4 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1136,6 +1136,14 @@ resolved "https://registry.yarnpkg.com/@types/json-schema/-/json-schema-7.0.15.tgz#596a1747233694d50f6ad8a7869fcb6f56cf5841" integrity sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA== +"@types/node-fetch@^2.6.11": + version "2.6.11" + resolved "https://registry.yarnpkg.com/@types/node-fetch/-/node-fetch-2.6.11.tgz#9b39b78665dae0e82a08f02f4967d62c66f95d24" + integrity sha512-24xFj9R5+rfQJLRyM56qh+wnVSYhyXC2tkoBndtY0U+vubqNsYXGjufB2nn8Q6gt0LrARwL6UBtMCSVCwl4B1g== + dependencies: + "@types/node" "*" + form-data "^4.0.0" + "@types/node@*": version "15.3.0" resolved "https://registry.yarnpkg.com/@types/node/-/node-15.3.0.tgz#d6fed7d6bc6854306da3dea1af9f874b00783e26" @@ -1363,6 +1371,13 @@ agent-base@^7.0.2, agent-base@^7.1.0: dependencies: debug "^4.3.4" +agent-base@^7.1.1: + version "7.1.1" + resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-7.1.1.tgz#bdbded7dfb096b751a2a087eeeb9664725b2e317" + integrity sha512-H0TSyFNDMomMNJQBn8wFV5YC/2eJ+VXECwOadZJT554xP6cODZHPX3H9QMQECxvrgiSOP1pHjy1sMWQVYJOUOA== + dependencies: + debug "^4.3.4" + ajv@^6.12.4: version "6.12.6" resolved "https://registry.yarnpkg.com/ajv/-/ajv-6.12.6.tgz#baf5a62e802b07d977034586f8c3baf5adf26df4" @@ -1476,6 +1491,11 @@ async@^3.2.4: resolved "https://registry.yarnpkg.com/async/-/async-3.2.4.tgz#2d22e00f8cddeb5fde5dd33522b56d1cf569a81c" integrity sha512-iAB+JbDEGXhyIUavoDl9WP/Jj106Kz9DEn1DPgYw5ruDn0e3Wgi3sKFm55sASdGBNOQB8F59d9qQ7deqrHA8wQ== +asynckit@^0.4.0: + version "0.4.0" + resolved "https://registry.yarnpkg.com/asynckit/-/asynckit-0.4.0.tgz#c79ed97f7f34cb8f2ba1bc9790bcc366474b4b79" + integrity sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q== + auto-js-ipfs@^2.1.1: version "2.3.0" resolved "https://registry.yarnpkg.com/auto-js-ipfs/-/auto-js-ipfs-2.3.0.tgz#2c2684074cdaa2eb579345c4f86420d7635956c8" @@ -1849,6 +1869,13 @@ color@^4.2.3: color-convert "^2.0.1" color-string "^1.9.0" +combined-stream@^1.0.8: + version "1.0.8" + resolved "https://registry.yarnpkg.com/combined-stream/-/combined-stream-1.0.8.tgz#c3d45a8b34fd730631a110a8a2520682b31d5a7f" + integrity sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg== + dependencies: + delayed-stream "~1.0.0" + concat-map@0.0.1: version "0.0.1" resolved "https://registry.yarnpkg.com/concat-map/-/concat-map-0.0.1.tgz#d8a96bd77fd68df7793a73036a3ba0d5405d477b" @@ -1905,6 +1932,11 @@ crypto-random-string@^4.0.0: dependencies: type-fest "^1.0.1" +data-uri-to-buffer@^4.0.0: + version "4.0.1" + resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-4.0.1.tgz#d8feb2b2881e6a4f58c2e08acfd0e2834e26222e" + integrity sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A== + data-uri-to-buffer@^5.0.1: version "5.0.1" resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-5.0.1.tgz#db89a9e279c2ffe74f50637a59a32fb23b3e4d7c" @@ -1972,6 +2004,11 @@ degenerator@^5.0.0: escodegen "^2.1.0" esprima "^4.0.1" +delayed-stream@~1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/delayed-stream/-/delayed-stream-1.0.0.tgz#df3ae199acadfb7d440aaae0b29e2272b24ec619" + integrity sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ== + denque@^2.1.0: version "2.1.0" resolved "https://registry.yarnpkg.com/denque/-/denque-2.1.0.tgz#e93e1a6569fb5e66f16a3c2a2964617d349d6ab1" @@ -2386,6 +2423,14 @@ fd-slicer@~1.1.0: dependencies: pend "~1.2.0" +fetch-blob@^3.1.2, fetch-blob@^3.1.4: + version "3.2.0" + resolved "https://registry.yarnpkg.com/fetch-blob/-/fetch-blob-3.2.0.tgz#f09b8d4bbd45adc6f0c20b7e787e793e309dcce9" + integrity sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ== + dependencies: + node-domexception "^1.0.0" + web-streams-polyfill "^3.0.3" + file-entry-cache@^6.0.1: version "6.0.1" resolved "https://registry.yarnpkg.com/file-entry-cache/-/file-entry-cache-6.0.1.tgz#211b2dd9659cb0394b073e7323ac3c933d522027" @@ -2439,6 +2484,22 @@ foreach@^2.0.5: resolved "https://registry.yarnpkg.com/foreach/-/foreach-2.0.5.tgz#0bee005018aeb260d0a3af3ae658dd0136ec1b99" integrity sha1-C+4AUBiusmDQo6865ljdATbsG5k= +form-data@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/form-data/-/form-data-4.0.0.tgz#93919daeaf361ee529584b9b31664dc12c9fa452" + integrity sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww== + dependencies: + asynckit "^0.4.0" + combined-stream "^1.0.8" + mime-types "^2.1.12" + +formdata-polyfill@^4.0.10: + version "4.0.10" + resolved "https://registry.yarnpkg.com/formdata-polyfill/-/formdata-polyfill-4.0.10.tgz#24807c31c9d402e002ab3d8c720144ceb8848423" + integrity sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g== + dependencies: + fetch-blob "^3.1.2" + fs-constants@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/fs-constants/-/fs-constants-1.0.0.tgz#6be0de9be998ce16af8afc24497b9ee9b7ccd9ad" @@ -3606,7 +3667,7 @@ mime-db@1.52.0: resolved "https://registry.yarnpkg.com/mime-db/-/mime-db-1.52.0.tgz#bbabcdc02859f4987301c856e3387ce5ec43bf70" integrity sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg== -mime-types@^2.1.35: +mime-types@^2.1.12, mime-types@^2.1.35: version "2.1.35" resolved "https://registry.yarnpkg.com/mime-types/-/mime-types-2.1.35.tgz#381a871b62a734450660ae3deee44813f70d959a" integrity sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw== @@ -3704,6 +3765,20 @@ node-addon-api@^6.1.0: resolved "https://registry.yarnpkg.com/node-addon-api/-/node-addon-api-6.1.0.tgz#ac8470034e58e67d0c6f1204a18ae6995d9c0d76" integrity sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA== +node-domexception@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/node-domexception/-/node-domexception-1.0.0.tgz#6888db46a1f71c0b76b3f7555016b63fe64766e5" + integrity sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ== + +node-fetch@^3.3.2: + version "3.3.2" + resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-3.3.2.tgz#d1e889bacdf733b4ff3b2b243eb7a12866a0b78b" + integrity sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA== + dependencies: + data-uri-to-buffer "^4.0.0" + fetch-blob "^3.1.4" + formdata-polyfill "^4.0.10" + node-int64@^0.4.0: version "0.4.0" resolved "https://registry.yarnpkg.com/node-int64/-/node-int64-0.4.0.tgz#87a9065cdb355d3182d8f94ce11188b825c68a3b" @@ -4429,6 +4504,15 @@ socks-proxy-agent@^8.0.2: debug "^4.3.4" socks "^2.7.1" +socks-proxy-agent@^8.0.3: + version "8.0.3" + resolved "https://registry.yarnpkg.com/socks-proxy-agent/-/socks-proxy-agent-8.0.3.tgz#6b2da3d77364fde6292e810b496cb70440b9b89d" + integrity sha512-VNegTZKhuGq5vSD6XNKlbqWhyt/40CgoEw8XxD6dhnm8Jq9IEa3nIa4HwnM8XOqU0CdB0BwWVXusqiFXfHB3+A== + dependencies: + agent-base "^7.1.1" + debug "^4.3.4" + socks "^2.7.1" + socks@^2.7.1: version "2.7.1" resolved "https://registry.yarnpkg.com/socks/-/socks-2.7.1.tgz#d8e651247178fde79c0663043e07240196857d55" @@ -4958,6 +5042,11 @@ web-encoding@^1.1.5: optionalDependencies: "@zxing/text-encoding" "0.9.0" +web-streams-polyfill@^3.0.3: + version "3.3.3" + resolved "https://registry.yarnpkg.com/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz#2073b91a2fdb1fbfbd401e7de0ac9f8214cecb4b" + integrity sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw== + which-boxed-primitive@^1.0.2: version "1.0.2" resolved "https://registry.yarnpkg.com/which-boxed-primitive/-/which-boxed-primitive-1.0.2.tgz#13757bc89b209b049fe5d86430e21cf40a89a8e6"