diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 39972502..7eb2b1bb 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -45,7 +45,7 @@ jobs:
- name: validate existing wacz
run: docker-compose run crawler wacz validate --file collections/wr-net/wr-net.wacz
- name: unzip wacz
- run: docker-compose run crawler unzip collections/wr-net/wr-net.wacz -d collections/wr-net/wacz
+ run: sudo unzip crawls/collections/wr-net/wr-net.wacz -d crawls/collections/wr-net/wacz
- name: run jest
run: sudo yarn jest
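
For local debugging, these CI steps can be reproduced directly; this sketch assumes the compose service bind-mounts `./crawls` into the container, matching the paths in the workflow above:

```sh
# validate the WACZ inside the crawler container, exactly as the workflow does
docker-compose run crawler wacz validate --file collections/wr-net/wr-net.wacz

# the unzip step now runs on the host against the bind-mounted crawls/ directory;
# the extracted files end up root-owned, hence sudo here and for the jest run
sudo unzip crawls/collections/wr-net/wr-net.wacz -d crawls/collections/wr-net/wacz
sudo yarn jest
```
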
diff --git a/Dockerfile b/Dockerfile
index f8d7a359..a28b8658 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,17 +1,31 @@
ARG BROWSER_VERSION=90
-FROM oldwebtoday/chrome:${BROWSER_VERSION} as chrome
-FROM nikolaik/python-nodejs:python3.8-nodejs14
-RUN curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add -
-RUN apt-get update -y \
- && apt-get install --no-install-recommends -qqy fonts-stix locales-all redis-server xvfb \
+ARG BROWSER_IMAGE_BASE=oldwebtoday/chrome
+ARG BROWSER_BIN=google-chrome
+
+FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION} as chrome
+
+FROM ubuntu:bionic
+
+RUN apt-get update -y && apt-get install --no-install-recommends -qqy software-properties-common \
+ && add-apt-repository -y ppa:deadsnakes \
+ && apt-get update -y \
+ && apt-get install --no-install-recommends -qqy build-essential fonts-stix locales-all redis-server xvfb gpg-agent curl git \
+ python3.8 python3.8-distutils python3.8-dev gpg ca-certificates \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
+RUN curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - \
+ && echo "deb https://dl.yarnpkg.com/debian/ stable main" | tee /etc/apt/sources.list.d/yarn.list \
+ && curl -sL https://deb.nodesource.com/setup_16.x -o /tmp/nodesource_setup.sh && bash /tmp/nodesource_setup.sh \
+ && apt-get update -y && apt-get install -qqy nodejs yarn \
+ && curl https://bootstrap.pypa.io/get-pip.py | python3.8 \
+ && pip install -U setuptools
+
+# build args must be re-declared here to be visible in the main build stage
ARG BROWSER_VERSION
+ARG BROWSER_BIN
ENV PROXY_HOST=localhost \
PROXY_PORT=8080 \
@@ -19,7 +33,8 @@ ENV PROXY_HOST=localhost \
PROXY_CA_FILE=/tmp/proxy-ca.pem \
DISPLAY=:99 \
GEOMETRY=1360x1020x16 \
- BROWSER_VERSION=${BROWSER_VERSION}
+ BROWSER_VERSION=${BROWSER_VERSION} \
+ BROWSER_BIN=${BROWSER_BIN}
COPY --from=chrome /tmp/*.deb /deb/
COPY --from=chrome /app/libpepflashplayer.so /app/libpepflashplayer.so
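
The new build args make the browser stage swappable at build time. A minimal sketch of a Chromium build, assuming the `oldwebtoday/chromium` image (referenced in the README change below) ships its binary as `chromium-browser` (the binary name is an assumption here):

```sh
docker build \
  --build-arg BROWSER_IMAGE_BASE=oldwebtoday/chromium \
  --build-arg BROWSER_VERSION=90 \
  --build-arg BROWSER_BIN=chromium-browser \
  -t browsertrix-crawler:chromium .
```

Omitting all three args keeps the previous behavior: the defaults build against `oldwebtoday/chrome:90` with `google-chrome`.
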
diff --git a/README.md b/README.md
index 9de5f4f0..76d0d1f9 100644
--- a/README.md
+++ b/README.md
@@ -37,13 +37,14 @@ Here's how you can use some of the command-line options to configure the crawl:
- To run more than one browser worker and crawl in parallel, add `--workers N`, where N is the number of browsers to run in parallel. More browsers require more CPU and network bandwidth, and do not guarantee faster crawling.
- To crawl into a new directory, specify a different name with the `--collection` param; if omitted, a new collection directory based on the current time will be created.
--
Browsertrix Crawler includes a number of additional command-line options, explained below.
## Crawling Configuration Options
-The Browsertrix Crawler docker image currently accepts the following parameters:
+
+
+The Browsertrix Crawler docker image currently accepts the following parameters:
```
crawler [options]
@@ -136,6 +137,8 @@ Options:
command line will take precedence.
[string]
```
+
+
For the `--waitUntil` flag, see [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options).
@@ -238,7 +241,7 @@ The current profile creation script is still experimental and the script attempt
The Docker container provided here packages up several components used in Browsertrix.
The system uses:
- - `oldwebtoday/chrome` - to install a recent version of Chrome (currently chrome:84)
+ - `oldwebtoday/chrome` or `oldwebtoday/chromium` - to install a recent version of Chrome (currently chrome:90) or Chromium (see below).
- `puppeteer-cluster` - for running Chrome browsers in parallel
- `pywb` - in recording mode for capturing the content
@@ -247,6 +250,19 @@ The crawl produces a single pywb collection, at `/crawls/collections/
diff --git a/requirements.txt b/requirements.txt
index 52214951..087edfcb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-pywb>=2.6.0b2
+pywb>=2.6.0b3
#git+https://github.com/webrecorder/pywb@main
uwsgi
wacz>=0.3.0
diff --git a/util/constants.js b/util/constants.js
index c507c392..34f57c5a 100644
--- a/util/constants.js
+++ b/util/constants.js
@@ -2,5 +2,5 @@
module.exports.HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
module.exports.WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
module.exports.BEHAVIOR_LOG_FUNC = "__bx_log";
-module.exports.CHROME_PATH = "google-chrome";
+module.exports.BROWSER_BIN = process.env.BROWSER_BIN || "google-chrome";
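
Since `BROWSER_BIN` now falls back to the environment, the browser binary can also be overridden at run time without rebuilding. A hedged sketch (the image name and crawl flags follow the project's usual invocation and are assumptions here):

```sh
docker run -e BROWSER_BIN=chromium-browser \
  -v $PWD/crawls:/crawls/ \
  webrecorder/browsertrix-crawler crawl --url https://example.com/ --collection example
```
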