mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Update to Chrome/Chromium 101 - (0.7.0 Beta 0) (#144)
* update base image: switch to browsertrix-base-image:101 with chrome/chromium 101
  - includes additional fonts and ubuntu 22.04 as base
  - add --disable-site-isolation-trials as default flag to support behaviors accessing iframes

* debugging support for shared redis state:
  - support pausing the crawler indefinitely if the crawl state is set to 'debug'
  - must be set/unset manually via external redis
  - designed for browsertrix-cloud for now

bump to 0.7.0-beta.0
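Since the 'debug' state must be set and unset from outside the crawler, a small external helper is convenient. The sketch below is hypothetical and not part of this commit: it assumes ioredis is installed, that REDIS_URL points at the same external Redis the crawler uses for shared state, and that RedisCrawlState stores the status under a "<crawlId>:status" key (verify the key layout against util/state.js):

    // pause-crawl.js (hypothetical helper, not part of this commit)
    const Redis = require("ioredis");

    async function setCrawlStatus(crawlId, status) {
      const redis = new Redis(process.env.REDIS_URL || "redis://localhost:6379/0");
      // assumed key layout; check util/state.js before relying on it
      await redis.set(`${crawlId}:status`, status);
      redis.disconnect();
    }

    // usage: node pause-crawl.js <crawlId> [status]
    // "debug" pauses the crawler; any other status lets it resume
    setCrawlStatus(process.argv[2], process.argv[3] || "debug");

Writing "debug" parks the crawler in the polling loop added in the crawler.js hunk at @@ -447 below; writing a different status (e.g. "running") lets the loop fall through so the crawl continues.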
parent cf90304fa7
commit 0a309af740

4 changed files with 25 additions and 39 deletions
Dockerfile (40 changes)

@@ -1,35 +1,10 @@
-ARG BROWSER_VERSION=91
+ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base
+ARG BROWSER_VERSION=101
+
+ARG BROWSER_BIN=google-chrome

-FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION} AS browser
-
-FROM ubuntu:bionic
-
-# http://bugs.python.org/issue19846
-# > At the moment, setting "LANG=C" on a Linux system *fundamentally breaks Python 3*, and that's not OK.
-ENV LANG C.UTF-8
-
-RUN apt-get update -y && apt-get install --no-install-recommends -qqy software-properties-common \
-    && add-apt-repository -y ppa:deadsnakes \
-    && apt-get update -y \
-    && apt-get install --no-install-recommends -qqy build-essential fonts-stix locales-all redis-server xvfb gpg-agent curl git socat \
-    python3.8 python3.8-distutils python3.8-dev gpg ca-certificates \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - \
-    && echo "deb https://dl.yarnpkg.com/debian/ stable main" | tee /etc/apt/sources.list.d/yarn.list \
-    && curl -sL https://deb.nodesource.com/setup_16.x -o /tmp/nodesource_setup.sh && bash /tmp/nodesource_setup.sh \
-    && apt-get update -y && apt-get install -qqy nodejs yarn \
-    && curl https://bootstrap.pypa.io/get-pip.py | python3.8 \
-    && pip install 'setuptools<58.0'
+FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION}
+
+# needed to add args to main build stage
+ARG BROWSER_VERSION
+ARG BROWSER_BIN

 ENV PROXY_HOST=localhost \
     PROXY_PORT=8080 \

@@ -38,16 +13,12 @@ ENV PROXY_HOST=localhost \
     DISPLAY=:99 \
     GEOMETRY=1360x1020x16 \
     BROWSER_VERSION=${BROWSER_VERSION} \
-    BROWSER_BIN=google-chrome
-
-COPY --from=browser /deb/*.deb /deb/
-RUN dpkg -i /deb/*.deb; apt-get update; apt-mark hold chromium-browser; apt --fix-broken install -qqy; \
-    rm -rf /var/lib/opts/lists/*
+    BROWSER_BIN=${BROWSER_BIN}

 WORKDIR /app

 ADD requirements.txt /app/
-RUN pip install -r requirements.txt
+RUN pip install -U setuptools; pip install -r requirements.txt

 ADD package.json /app/

@@ -62,8 +33,7 @@ ADD util/*.js /app/util/
 COPY config.yaml /app/
 ADD html/ /app/html/

-RUN ln -s /app/main.js /usr/bin/crawl
-RUN ln -s /app/create-login-profile.js /usr/bin/create-login-profile
+RUN ln -s /app/main.js /usr/bin/crawl; ln -s /app/create-login-profile.js /usr/bin/create-login-profile

 WORKDIR /crawls
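Because the build is now a single stage on top of the prebuilt browser base image, the browser can be swapped at build time through the new args, e.g. docker build --build-arg BROWSER_VERSION=101 --build-arg BROWSER_BIN=chromium-browser . (the chromium-browser value is an assumption and only works if the base image tag ships that binary). Note that BROWSER_VERSION and BROWSER_BIN are re-declared after FROM so the ENV block in the main stage can see them.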
crawler.js (19 changes)

@@ -168,7 +168,6 @@ class Crawler {
     this.statusLog(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`);

     this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.timeout * 2, os.hostname());
-    await this.crawlState.setStatus("running");

   } else {
     this.statusLog("Storing state in memory");

@@ -292,7 +291,9 @@ class Crawler {
   } finally {
     console.log(status);

-    await this.crawlState.setStatus(status);
+    if (this.crawlState) {
+      await this.crawlState.setStatus(status);
+    }

     process.exit(this.exitCode);
   }

@@ -447,6 +448,18 @@ class Crawler {
       return;
     }

+    await this.initCrawlState();
+
+    let initState = await this.crawlState.getStatus();
+
+    while (initState === "debug") {
+      console.log("Paused for debugging, will continue after manual resume");
+
+      await this.sleep(60);
+
+      initState = await this.crawlState.getStatus();
+    }
+
     if (this.params.generateWACZ) {
       this.storage = initStorage();
     }

@@ -463,7 +476,9 @@ class Crawler {
     });

-    this.cluster.jobQueue = await this.initCrawlState();
+    this.cluster.jobQueue = this.crawlState;
+
+    await this.crawlState.setStatus("running");

     if (this.params.state) {
       await this.crawlState.load(this.params.state, this.params.scopedSeeds, true);
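For orientation, the new pause loop depends only on two small state accessors. Below is a minimal sketch of what they plausibly look like, assuming an ioredis client and a "<crawlId>:status" key; the actual implementation is RedisCrawlState in util/state.js and may differ:

    // Hypothetical sketch, not the actual RedisCrawlState from util/state.js.
    // Assumes: redis is an ioredis client, key is the crawl id used as prefix.
    class CrawlStateSketch {
      constructor(redis, key) {
        this.redis = redis;
        this.key = key;
      }

      // called with "running", "debug", or the final crawl status
      async setStatus(status) {
        await this.redis.set(`${this.key}:status`, status);
      }

      async getStatus() {
        return await this.redis.get(`${this.key}:status`);
      }
    }

Since the status lives in Redis rather than in process memory, an operator (or browsertrix-cloud) can flip it externally between iterations of the 60-second sleep loop above.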
package.json

@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.6.0",
+  "version": "0.7.0-beta.0",
   "main": "browsertrix-crawler",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
   "author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
util/browser.js

@@ -94,6 +94,7 @@ module.exports.chromeArgs = (proxy, userAgent=null, extraArgs=[]) => {
     "--enable-features=NetworkService,NetworkServiceInProcess",
     "--autoplay-policy=no-user-gesture-required",
     "--disable-features=IsolateOrigins,site-per-process,ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,AcceptCHFrame,AutoExpandDetailsElement",
+    "--disable-site-isolation-trials",
     "--disable-popup-blocking",
     "--disable-backgrounding-occluded-windows",
     `--user-agent=${userAgent || getDefaultUA()}`,
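The array returned by chromeArgs becomes the browser's launch flags. A minimal usage sketch, assuming puppeteer is installed and chromeArgs is imported from util/browser.js as above; in the crawler itself the flags are wired up through puppeteer-cluster rather than a direct puppeteer.launch() call:

    // Hypothetical standalone usage of the chromeArgs helper shown above.
    const puppeteer = require("puppeteer");
    const { chromeArgs } = require("./util/browser");

    (async () => {
      const browser = await puppeteer.launch({
        headless: false,        // the crawler renders into Xvfb (DISPLAY=:99)
        args: chromeArgs(null), // no proxy; includes --disable-site-isolation-trials
      });
      const page = await browser.newPage();
      await page.goto("https://example.com/");
      await browser.close();
    })();

With site isolation trials disabled (alongside the IsolateOrigins and site-per-process entries already disabled above), cross-origin iframes share the page's renderer process, which is what allows the crawler's behaviors to reach into iframe content.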