diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 39972502..7eb2b1bb 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -45,7 +45,7 @@ jobs:
       - name: validate existing wacz
         run: docker-compose run crawler wacz validate --file collections/wr-net/wr-net.wacz
       - name: unzip wacz
-        run: docker-compose run crawler unzip collections/wr-net/wr-net.wacz -d collections/wr-net/wacz
+        run: sudo unzip crawls/collections/wr-net/wr-net.wacz -d crawls/collections/wr-net/wacz
       - name: run jest
         run: sudo yarn jest
diff --git a/Dockerfile b/Dockerfile
index f8d7a359..a28b8658 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,17 +1,31 @@
 ARG BROWSER_VERSION=90
 
-FROM oldwebtoday/chrome:${BROWSER_VERSION} as chrome
+ARG BROWSER_IMAGE_BASE=oldwebtoday/chrome
 
-FROM nikolaik/python-nodejs:python3.8-nodejs14
+ARG BROWSER_BIN=google-chrome
 
-RUN curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add -
+FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION} as chrome
 
-RUN apt-get update -y \
-    && apt-get install --no-install-recommends -qqy fonts-stix locales-all redis-server xvfb \
+FROM ubuntu:bionic
+
+RUN apt-get update -y && apt-get install --no-install-recommends -qqy software-properties-common \
+    && add-apt-repository -y ppa:deadsnakes \
+    && apt-get update -y \
+    && apt-get install --no-install-recommends -qqy build-essential fonts-stix locales-all redis-server xvfb gpg-agent curl git \
+       python3.8 python3.8-distutils python3.8-dev gpg ca-certificates \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
+RUN curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - \
+    && echo "deb https://dl.yarnpkg.com/debian/ stable main" | tee /etc/apt/sources.list.d/yarn.list \
+    && curl -sL https://deb.nodesource.com/setup_16.x -o /tmp/nodesource_setup.sh && bash /tmp/nodesource_setup.sh \
+    && apt-get update -y && apt-get install -qqy nodejs yarn \
+    && curl https://bootstrap.pypa.io/get-pip.py | python3.8 \
+    && pip install -U setuptools
+
+# needed to add args to main build stage
 ARG BROWSER_VERSION
+ARG BROWSER_BIN
 
 ENV PROXY_HOST=localhost \
     PROXY_PORT=8080 \
@@ -19,7 +33,8 @@ ENV PROXY_HOST=localhost \
     PROXY_CA_FILE=/tmp/proxy-ca.pem \
     DISPLAY=:99 \
     GEOMETRY=1360x1020x16 \
-    BROWSER_VERSION=${BROWSER_VERSION}
+    BROWSER_VERSION=${BROWSER_VERSION} \
+    BROWSER_BIN=${BROWSER_BIN}
 
 COPY --from=chrome /tmp/*.deb /deb/
 COPY --from=chrome /app/libpepflashplayer.so /app/libpepflashplayer.so
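A note on the Dockerfile change above: Docker build args do not carry across multi-stage `FROM` boundaries, which is why `ARG BROWSER_VERSION` and `ARG BROWSER_BIN` are re-declared in the main stage (see the `# needed to add args to main build stage` comment) and then exported via `ENV`. Below is a minimal sketch of the runtime side; `resolveBrowserPath` is a hypothetical helper, not part of this diff, showing how code inside the container could pick up and validate the configured binary:

```js
// resolve-browser.js: hypothetical helper, not part of this diff.
// Sketches how the BROWSER_BIN value exported via ENV above could be
// located and validated when the container starts.
const { execSync } = require("child_process");

// Same fallback as util/constants.js (see below): google-chrome.
const browserBin = process.env.BROWSER_BIN || "google-chrome";

function resolveBrowserPath(bin) {
  try {
    // `which` prints the absolute path if the binary is on PATH and
    // exits non-zero (throwing here) if it is not installed.
    return execSync(`which ${bin}`).toString().trim();
  } catch (e) {
    throw new Error(`browser binary "${bin}" not found on PATH`);
  }
}

console.log(resolveBrowserPath(browserBin));
```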
diff --git a/README.md b/README.md
index 9de5f4f0..76d0d1f9 100644
--- a/README.md
+++ b/README.md
@@ -37,13 +37,14 @@ Here's how you can use some of the command-line options to configure the crawl:
 
 - To run more than one browser worker and crawl in parallel, and `--workers N` where N is number of browsers to run in parallel. More browsers will require more CPU and network bandwidth, and does not guarantee faster crawling.
 
 - To crawl into a new directory, specify a different name for the `--collection` param, or, if omitted, a new collection directory based on current time will be created.
 
-- Browsertrix Crawler includes a number of additional command-line options, explained below.
 
 ## Crawling Configuration Options
 
-The Browsertrix Crawler docker image currently accepts the following parameters:
+
+<details>
+<summary>The Browsertrix Crawler docker image currently accepts the following parameters:</summary>
 
 ```
 crawler [options]
@@ -136,6 +137,8 @@ Options:
                                    command line will take precedence. [string]
 ```
 
+</details>
+
 For the `--waitUntil` flag, see [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options).
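The `--waitUntil` values accepted here (`load`, `domcontentloaded`, `networkidle0`, `networkidle2`, per `WAIT_UNTIL_OPTS` below) pass through to puppeteer's `page.goto()`. An illustrative sketch, assuming puppeteer is installed; this file is not part of the diff:

```js
// waituntil-example.js: illustrative only, not part of this diff.
const puppeteer = require("puppeteer");

(async () => {
  const browser = await puppeteer.launch({ args: ["--no-sandbox"] });
  const page = await browser.newPage();

  // "networkidle2" resolves once no more than 2 network connections
  // have been active for at least 500ms, often a better fit for
  // archiving than plain "load" on pages with long-lived connections.
  await page.goto("https://example.com/", { waitUntil: "networkidle2" });

  await browser.close();
})();
```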
@@ -238,7 +241,7 @@ The current profile creation script is still experimental and the script attempt
 
 The Docker container provided here packages up several components used in Browsertrix.
 
 The system uses:
- - `oldwebtoday/chrome` - to install a recent version of Chrome (currently chrome:84)
+ - `oldwebtoday/chrome` or `oldwebtoday/chromium` - to install a recent version of Chrome (currently chrome:90) or Chromium (see below).
  - `puppeteer-cluster` - for running Chrome browsers in parallel
  - `pywb` - in recording mode for capturing the content
@@ -247,6 +250,19 @@ The crawl produces a single pywb collection, at `/crawls/collections/
diff --git a/requirements.txt b/requirements.txt
index 52214951..087edfcb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-pywb>=2.6.0b2
+pywb>=2.6.0b3
 #git+https://github.com/webrecorder/pywb@main
 uwsgi
 wacz>=0.3.0
diff --git a/util/constants.js b/util/constants.js
index c507c392..34f57c5a 100644
--- a/util/constants.js
+++ b/util/constants.js
@@ -2,5 +2,5 @@
 module.exports.HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
 module.exports.WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
 module.exports.BEHAVIOR_LOG_FUNC = "__bx_log";
-module.exports.CHROME_PATH = "google-chrome";
+module.exports.BROWSER_BIN = process.env.BROWSER_BIN || "google-chrome";
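Renaming `CHROME_PATH` to `BROWSER_BIN` and reading it from the environment is what lets a single image drive either browser: passing a different value at run time (e.g. `docker run -e BROWSER_BIN=chromium-browser ...`, with the exact binary name depending on the browser image) switches browsers without a rebuild. Here is a minimal sketch of how the constant might feed a launch call downstream; this is hypothetical wiring, not the crawler's actual launcher:

```js
// launch-sketch.js: hypothetical wiring, not the crawler's actual launcher.
const puppeteer = require("puppeteer");
const { BROWSER_BIN } = require("./util/constants");

(async () => {
  const browser = await puppeteer.launch({
    // BROWSER_BIN honors the env var set via the Dockerfile's ENV and
    // falls back to "google-chrome" when unset.
    executablePath: BROWSER_BIN,
    args: ["--no-sandbox"],
  });
  console.log(await browser.version());
  await browser.close();
})();
```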