Update to Chrome/Chromium 101 - (0.7.0 Beta 0) (#144)

* update base image:
- switch to browsertrix-browser-base:101 with chrome/chromium 101
- includes additional fonts and ubuntu 22.04 as base.
- add --disable-site-isolation-trials as default flag to support behaviors accessing iframes

* debugging support for shared redis state:
- support pausing crawler indefinitely if crawl state is set to 'debug'
- must be set/unset manually via external redis
- designed for browsertrix-cloud for now

bump to 0.7.0-beta.0
This commit is contained in:
Ilya Kreymer 2022-06-30 19:24:26 -07:00 committed by GitHub
parent cf90304fa7
commit 0a309af740
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 25 additions and 39 deletions

View file

@ -1,35 +1,10 @@
ARG BROWSER_VERSION=91
ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base
ARG BROWSER_VERSION=101
ARG BROWSER_BIN=google-chrome FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION}
FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION} AS browser
FROM ubuntu:bionic
# http://bugs.python.org/issue19846
# > At the moment, setting "LANG=C" on a Linux system *fundamentally breaks Python 3*, and that's not OK.
ENV LANG C.UTF-8
RUN apt-get update -y && apt-get install --no-install-recommends -qqy software-properties-common \
&& add-apt-repository -y ppa:deadsnakes \
&& apt-get update -y \
&& apt-get install --no-install-recommends -qqy build-essential fonts-stix locales-all redis-server xvfb gpg-agent curl git socat \
python3.8 python3.8-distutils python3.8-dev gpg ca-certificates \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
RUN curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - \
&& echo "deb https://dl.yarnpkg.com/debian/ stable main" | tee /etc/apt/sources.list.d/yarn.list \
&& curl -sL https://deb.nodesource.com/setup_16.x -o /tmp/nodesource_setup.sh && bash /tmp/nodesource_setup.sh \
&& apt-get update -y && apt-get install -qqy nodejs yarn \
&& curl https://bootstrap.pypa.io/get-pip.py | python3.8 \
&& pip install 'setuptools<58.0'
# needed to add args to main build stage # needed to add args to main build stage
ARG BROWSER_VERSION ARG BROWSER_VERSION
ARG BROWSER_BIN
ENV PROXY_HOST=localhost \ ENV PROXY_HOST=localhost \
PROXY_PORT=8080 \ PROXY_PORT=8080 \
@ -38,16 +13,12 @@ ENV PROXY_HOST=localhost \
DISPLAY=:99 \ DISPLAY=:99 \
GEOMETRY=1360x1020x16 \ GEOMETRY=1360x1020x16 \
BROWSER_VERSION=${BROWSER_VERSION} \ BROWSER_VERSION=${BROWSER_VERSION} \
BROWSER_BIN=${BROWSER_BIN} BROWSER_BIN=google-chrome
COPY --from=browser /deb/*.deb /deb/
RUN dpkg -i /deb/*.deb; apt-get update; apt-mark hold chromium-browser; apt --fix-broken install -qqy; \
rm -rf /var/lib/opts/lists/*
WORKDIR /app WORKDIR /app
ADD requirements.txt /app/ ADD requirements.txt /app/
RUN pip install -r requirements.txt RUN pip install -U setuptools; pip install -r requirements.txt
ADD package.json /app/ ADD package.json /app/
@ -62,8 +33,7 @@ ADD util/*.js /app/util/
COPY config.yaml /app/ COPY config.yaml /app/
ADD html/ /app/html/ ADD html/ /app/html/
RUN ln -s /app/main.js /usr/bin/crawl RUN ln -s /app/main.js /usr/bin/crawl; ln -s /app/create-login-profile.js /usr/bin/create-login-profile
RUN ln -s /app/create-login-profile.js /usr/bin/create-login-profile
WORKDIR /crawls WORKDIR /crawls

View file

@ -168,7 +168,6 @@ class Crawler {
this.statusLog(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`); this.statusLog(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`);
this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.timeout * 2, os.hostname()); this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.timeout * 2, os.hostname());
await this.crawlState.setStatus("running");
} else { } else {
this.statusLog("Storing state in memory"); this.statusLog("Storing state in memory");
@ -292,7 +291,9 @@ class Crawler {
} finally { } finally {
console.log(status); console.log(status);
if (this.crawlState) {
await this.crawlState.setStatus(status); await this.crawlState.setStatus(status);
}
process.exit(this.exitCode); process.exit(this.exitCode);
} }
@ -447,6 +448,18 @@ class Crawler {
return; return;
} }
await this.initCrawlState();
let initState = await this.crawlState.getStatus();
while (initState === "debug") {
console.log("Paused for debugging, will continue after manual resume");
await this.sleep(60);
initState = await this.crawlState.getStatus();
}
if (this.params.generateWACZ) { if (this.params.generateWACZ) {
this.storage = initStorage(); this.storage = initStorage();
} }
@ -463,7 +476,9 @@ class Crawler {
}); });
this.cluster.jobQueue = await this.initCrawlState(); this.cluster.jobQueue = this.crawlState;
await this.crawlState.setStatus("running");
if (this.params.state) { if (this.params.state) {
await this.crawlState.load(this.params.state, this.params.scopedSeeds, true); await this.crawlState.load(this.params.state, this.params.scopedSeeds, true);

View file

@ -1,6 +1,6 @@
{ {
"name": "browsertrix-crawler", "name": "browsertrix-crawler",
"version": "0.6.0", "version": "0.7.0-beta.0",
"main": "browsertrix-crawler", "main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler", "repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software", "author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",

View file

@ -94,6 +94,7 @@ module.exports.chromeArgs = (proxy, userAgent=null, extraArgs=[]) => {
"--enable-features=NetworkService,NetworkServiceInProcess", "--enable-features=NetworkService,NetworkServiceInProcess",
"--autoplay-policy=no-user-gesture-required", "--autoplay-policy=no-user-gesture-required",
"--disable-features=IsolateOrigins,site-per-process,ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,AcceptCHFrame,AutoExpandDetailsElement", "--disable-features=IsolateOrigins,site-per-process,ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,AcceptCHFrame,AutoExpandDetailsElement",
"--disable-site-isolation-trials",
"--disable-popup-blocking", "--disable-popup-blocking",
"--disable-backgrounding-occluded-windows", "--disable-backgrounding-occluded-windows",
`--user-agent=${userAgent || getDefaultUA()}`, `--user-agent=${userAgent || getDefaultUA()}`,