From 0a309af7402ba4cc487cc41bb30d76c2b8a440c3 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 30 Jun 2022 19:24:26 -0700 Subject: [PATCH] Update to Chrome/Chromium 101 - (0.7.0 Beta 0) (#144) * update base image - switch to browsertrix-browser-base:101 with chrome/chromium 101, - includes additional fonts and ubuntu 22.04 as base. - add --disable-site-isolation-trials as default flag to support behaviors accessing iframes * debugging support for shared redis state: - support pausing crawler indefinitely if crawl state is set to 'debug' - must be set/unset manually via external redis - designed for browsertrix-cloud for now bump to 0.7.0-beta.0 --- Dockerfile | 40 +++++----------------------------------- crawler.js | 21 ++++++++++++++++++--- package.json | 2 +- util/browser.js | 1 + 4 files changed, 25 insertions(+), 39 deletions(-) diff --git a/Dockerfile b/Dockerfile index fd3c9d36..f7da313d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,35 +1,10 @@ -ARG BROWSER_VERSION=91 - ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base +ARG BROWSER_VERSION=101 -ARG BROWSER_BIN=google-chrome - -FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION} AS browser - -FROM ubuntu:bionic - -# http://bugs.python.org/issue19846 -# > At the moment, setting "LANG=C" on a Linux system *fundamentally breaks Python 3*, and that's not OK. 
-ENV LANG C.UTF-8 - -RUN apt-get update -y && apt-get install --no-install-recommends -qqy software-properties-common \ - && add-apt-repository -y ppa:deadsnakes \ - && apt-get update -y \ - && apt-get install --no-install-recommends -qqy build-essential fonts-stix locales-all redis-server xvfb gpg-agent curl git socat \ - python3.8 python3.8-distutils python3.8-dev gpg ca-certificates \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -RUN curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - \ - && echo "deb https://dl.yarnpkg.com/debian/ stable main" | tee /etc/apt/sources.list.d/yarn.list \ - && curl -sL https://deb.nodesource.com/setup_16.x -o /tmp/nodesource_setup.sh && bash /tmp/nodesource_setup.sh \ - && apt-get update -y && apt-get install -qqy nodejs yarn \ - && curl https://bootstrap.pypa.io/get-pip.py | python3.8 \ - && pip install 'setuptools<58.0' +FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION} # needed to add args to main build stage ARG BROWSER_VERSION -ARG BROWSER_BIN ENV PROXY_HOST=localhost \ PROXY_PORT=8080 \ @@ -38,16 +13,12 @@ ENV PROXY_HOST=localhost \ DISPLAY=:99 \ GEOMETRY=1360x1020x16 \ BROWSER_VERSION=${BROWSER_VERSION} \ - BROWSER_BIN=${BROWSER_BIN} - -COPY --from=browser /deb/*.deb /deb/ -RUN dpkg -i /deb/*.deb; apt-get update; apt-mark hold chromium-browser; apt --fix-broken install -qqy; \ - rm -rf /var/lib/opts/lists/* + BROWSER_BIN=google-chrome WORKDIR /app ADD requirements.txt /app/ -RUN pip install -r requirements.txt +RUN pip install -U setuptools; pip install -r requirements.txt ADD package.json /app/ @@ -62,8 +33,7 @@ ADD util/*.js /app/util/ COPY config.yaml /app/ ADD html/ /app/html/ -RUN ln -s /app/main.js /usr/bin/crawl -RUN ln -s /app/create-login-profile.js /usr/bin/create-login-profile +RUN ln -s /app/main.js /usr/bin/crawl; ln -s /app/create-login-profile.js /usr/bin/create-login-profile WORKDIR /crawls diff --git a/crawler.js b/crawler.js index e1f5e7aa..91ad5c1a 100644 --- a/crawler.js +++ 
b/crawler.js @@ -168,7 +168,6 @@ class Crawler { this.statusLog(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`); this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.timeout * 2, os.hostname()); - await this.crawlState.setStatus("running"); } else { this.statusLog("Storing state in memory"); @@ -292,7 +291,9 @@ class Crawler { } finally { console.log(status); - await this.crawlState.setStatus(status); + if (this.crawlState) { + await this.crawlState.setStatus(status); + } process.exit(this.exitCode); } @@ -447,6 +448,18 @@ class Crawler { return; } + await this.initCrawlState(); + + let initState = await this.crawlState.getStatus(); + + while (initState === "debug") { + console.log("Paused for debugging, will continue after manual resume"); + + await this.sleep(60); + + initState = await this.crawlState.getStatus(); + } + if (this.params.generateWACZ) { this.storage = initStorage(); } @@ -463,7 +476,9 @@ class Crawler { }); - this.cluster.jobQueue = await this.initCrawlState(); + this.cluster.jobQueue = this.crawlState; + + await this.crawlState.setStatus("running"); if (this.params.state) { await this.crawlState.load(this.params.state, this.params.scopedSeeds, true); diff --git a/package.json b/package.json index 1e4b955f..96f35134 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "0.6.0", + "version": "0.7.0-beta.0", "main": "browsertrix-crawler", "repository": "https://github.com/webrecorder/browsertrix-crawler", "author": "Ilya Kreymer , Webrecorder Software", diff --git a/util/browser.js b/util/browser.js index 109ea404..b980a52b 100644 --- a/util/browser.js +++ b/util/browser.js @@ -94,6 +94,7 @@ module.exports.chromeArgs = (proxy, userAgent=null, extraArgs=[]) => { "--enable-features=NetworkService,NetworkServiceInProcess", "--autoplay-policy=no-user-gesture-required", 
"--disable-features=IsolateOrigins,site-per-process,ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,AcceptCHFrame,AutoExpandDetailsElement", + "--disable-site-isolation-trials", "--disable-popup-blocking", "--disable-backgrounding-occluded-windows", `--user-agent=${userAgent || getDefaultUA()}`,