diff --git a/Dockerfile b/Dockerfile index 1ece522a..dc0226fd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,7 +34,7 @@ ADD *.js /app/ RUN ln -s /app/main.js /usr/bin/crawl -WORKDIR /output +WORKDIR /crawls CMD ["crawl"] diff --git a/README.md b/README.md index 3169d3ab..cc835e6a 100644 --- a/README.md +++ b/README.md @@ -26,9 +26,9 @@ The system uses: - `pywb` - in recording mode for capturing the content -The crawl produces a single pywb collection, at `/output/collections/` in the Docker container. +The crawl produces a single pywb collection, at `/crawls/collections/` in the Docker container. -To access the contents of the crawl, the `/output` directory should be mounted to a volume (default in the Docker Compose setup). +To access the contents of the crawl, the `/crawls` directory in the container should be mounted to a volume (default in the Docker Compose setup). ## Crawling Parameters @@ -62,9 +62,11 @@ Options: --headless Run in headless mode, otherwise start xvfb [boolean] [default: false] --driver JS driver for the crawler - [string] [default: "/Users/ilya/work/browsertrix-crawler/defaultDriver.js"] + [string] [default: "/app/defaultDriver.js"] --generateCDX If set, generate index (CDXJ) for use with pywb after crawl is done [boolean] [default: false] + --cwd Crawl working directory for captures (pywb root). If not + set, defaults to process.cwd [string] [default: "/crawls"] ``` For the `--waitUntil` flag, see [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). @@ -86,9 +88,9 @@ docker-compose build docker-compose run crawler crawl --url https://webrecorder.net/ --generateCDX --collection wr-net --workers 2 ``` -While the crawl is running, puppeteer-cluster provides monitoring output which is enabled by default and prints the crawl status to the Docker log. +In this example, the crawl data is written to `./crawls/collections/wr-net` by default. 
-The output is written to `./crawls/collections/wr-net` by default. +While the crawl is running, the status of the crawl (provided by puppeteer-cluster monitoring) prints the progress to the Docker log. When done, you can even use the browsertrix-crawler image to also start a local [pywb](https://github.com/webrecorder/pywb) instance to preview the crawl: @@ -110,7 +112,7 @@ flags are [needed to run Chrome in Docker](https://github.com/puppeteer/puppetee ```bash -docker run -v $PWD/crawls:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1g -it webrecorder/browsertrix-crawler --url https://webrecorder.net/ --workers 2 +docker run -v $PWD/crawls:/crawls --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1g -it webrecorder/browsertrix-crawler --url https://webrecorder.net/ --workers 2 ``` diff --git a/crawler.js b/crawler.js index 7a8b9d6c..89b9da70 100644 --- a/crawler.js +++ b/crawler.js @@ -16,16 +16,6 @@ const HTTPS_AGENT = require("https").Agent({ const HTTP_AGENT = require("http").Agent(); -process.once("SIGINT", () => { - console.log("SIGINT received, exiting"); - process.exit(1); -}); - -process.once("SIGTERM", () => { - console.log("SIGTERM received, exiting"); - process.exit(1); -}); - // ============================================================================ class Crawler { @@ -52,7 +42,7 @@ class Crawler { } bootstrap() { - const opts = {stdio: "ignore"}; + const opts = {stdio: "ignore", cwd: this.params.cwd}; child_process.spawn("redis-server", {...opts, cwd: "/tmp/"}); @@ -153,6 +143,12 @@ class Crawler { describe: "If set, generate index (CDXJ) for use with pywb after crawl is done", type: "boolean", default: false, + }, + + "cwd": { + describe: "Crawl working directory for captures (pywb root). 
If not set, defaults to process.cwd", + type: "string", + default: process.cwd(), } }; } @@ -307,7 +303,7 @@ class Crawler { if (this.params.generateCdx) { console.log("Generate CDX"); - child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit"}); + child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd}); } } diff --git a/docker-compose.yml b/docker-compose.yml index e37ab1ed..37bda52d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -7,7 +7,7 @@ services: context: ./ volumes: - - ./crawls:/output + - ./crawls:/crawls cap_add: - NET_ADMIN diff --git a/main.js b/main.js index a97067f3..d295ce6f 100755 --- a/main.js +++ b/main.js @@ -1,5 +1,16 @@ #!/usr/bin/env node +process.once("SIGINT", () => { + console.log("SIGINT received, exiting"); + process.exit(1); +}); + +process.once("SIGTERM", () => { + console.log("SIGTERM received, exiting"); + process.exit(1); +}); + + const { Crawler } = require("./crawler"); new Crawler().run();