Mirror of https://github.com/webrecorder/browsertrix-crawler.git
Synced 2025-10-19 06:23:16 +00:00

support custom crawl directory with --cwd flag, default to /crawls

update README

This commit is contained in:
parent e2bce2f30d
commit 8f740d4e24

5 changed files with 29 additions and 20 deletions
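Condensed into one self-contained sketch, the change threads a new `--cwd` option into the pywb-related subprocesses. The option definition below is copied from the crawler.js hunk; the yargs wiring around it and the `my-collection` name are assumptions for illustration, not the crawler's actual entrypoint:

```js
#!/usr/bin/env node
// Sketch of the pattern this commit applies: a --cwd flag whose default is
// process.cwd(), passed as the working directory of every pywb subprocess.
// The yargs scaffolding is an assumption based on the option shape in crawler.js.
const child_process = require("child_process");

const params = require("yargs").options({
  "cwd": {
    describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd",
    type: "string",
    default: process.cwd(),
  }
}).argv;

// Subprocesses that read or write the pywb collection run in that directory
// ("my-collection" is a hypothetical collection name):
child_process.spawnSync("wb-manager", ["reindex", "my-collection"],
  {stdio: "inherit", cwd: params.cwd});
```

With the Dockerfile's `WORKDIR /crawls`, the `process.cwd()` default resolves to `/crawls` inside the container, which is why the README and docker-compose.yml below now mount the volume at that path.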
Dockerfile

@@ -34,7 +34,7 @@ ADD *.js /app/
 RUN ln -s /app/main.js /usr/bin/crawl
 
-WORKDIR /output
+WORKDIR /crawls
 
 CMD ["crawl"]
README.md | 14

@@ -26,9 +26,9 @@ The system uses:
 - `pywb` - in recording mode for capturing the content
 
-The crawl produces a single pywb collection, at `/output/collections/<collection name>` in the Docker container.
+The crawl produces a single pywb collection, at `/crawls/collections/<collection name>` in the Docker container.
 
-To access the contents of the crawl, the `/output` directory should be mounted to a volume (default in the Docker Compose setup).
+To access the contents of the crawl, the `/crawls` directory in the container should be mounted to a volume (default in the Docker Compose setup).
 
 ## Crawling Parameters
 
@@ -62,9 +62,11 @@ Options:
   --headless     Run in headless mode, otherwise start xvfb
                                                 [boolean] [default: false]
   --driver       JS driver for the crawler
-                 [string] [default: "/Users/ilya/work/browsertrix-crawler/defaultDriver.js"]
+                 [string] [default: "/app/defaultDriver.js"]
   --generateCDX  If set, generate index (CDXJ) for use with pywb after crawl
                  is done                        [boolean] [default: false]
+  --cwd          Crawl working directory for captures (pywb root). If not
+                 set, defaults to process.cwd   [string] [default: "/crawls"]
 ```
 
 For the `--waitUntil` flag, see [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options).
 
@@ -86,9 +88,9 @@ docker-compose build
 docker-compose run crawler crawl --url https://webrecorder.net/ --generateCDX --collection wr-net --workers 2
 ```
 
-While the crawl is running, puppeteer-cluster provides monitoring output which is enabled by default and prints the crawl status to the Docker log.
+In this example, the crawl data is written to `./crawls/collections/wr-net` by default.
 
-The output is written to `./crawls/collections/wr-net` by default.
+While the crawl is running, the status of the crawl (provided by puppeteer-cluster monitoring) prints the progress to the Docker log.
 
 When done, you can even use the browsertrix-crawler image to also start a local [pywb](https://github.com/webrecorder/pywb) instance
 to preview the crawl:
 
@@ -110,7 +112,7 @@ flags are [needed to run Chrome in Docker](https://github.com/puppeteer/puppetee
 
 ```bash
-docker run -v $PWD/crawls:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1g -it webrecorder/browsertrix-crawler --url https://webrecorder.net/ --workers 2
+docker run -v $PWD/crawls:/crawls --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1g -it webrecorder/browsertrix-crawler --url https://webrecorder.net/ --workers 2
 ```
crawler.js | 20

@@ -16,16 +16,6 @@ const HTTPS_AGENT = require("https").Agent({
 const HTTP_AGENT = require("http").Agent();
 
-process.once("SIGINT", () => {
-  console.log("SIGINT received, exiting");
-  process.exit(1);
-});
-
-process.once("SIGTERM", () => {
-  console.log("SIGTERM received, exiting");
-  process.exit(1);
-});
-
 
 // ============================================================================
 class Crawler {
@@ -52,7 +42,7 @@ class Crawler {
   }
 
   bootstrap() {
-    const opts = {stdio: "ignore"};
+    const opts = {stdio: "ignore", cwd: this.params.cwd};
 
     child_process.spawn("redis-server", {...opts, cwd: "/tmp/"});
 
@@ -153,6 +143,12 @@ class Crawler {
        describe: "If set, generate index (CDXJ) for use with pywb after crawl is done",
        type: "boolean",
        default: false,
      },
+
+      "cwd": {
+        describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd",
+        type: "string",
+        default: process.cwd(),
+      }
    };
  }
 
@@ -307,7 +303,7 @@ class Crawler {
    if (this.params.generateCdx) {
      console.log("Generate CDX");
 
-      child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit"});
+      child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd});
    }
  }
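One subtlety in the bootstrap hunk above: `opts` now carries the crawl directory, but the redis-server spawn spreads `opts` and then sets `cwd: "/tmp/"`, so redis keeps its own working directory. With object spread, later properties win; a quick illustration:

```js
// Later properties override earlier ones in an object spread, so the
// redis-server child keeps /tmp/ as its cwd even after this commit.
const opts = {stdio: "ignore", cwd: "/crawls"};
console.log({...opts, cwd: "/tmp/"});
// => { stdio: 'ignore', cwd: '/tmp/' }
```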
docker-compose.yml

@@ -7,7 +7,7 @@ services:
       context: ./
 
     volumes:
-      - ./crawls:/output
+      - ./crawls:/crawls
 
     cap_add:
      - NET_ADMIN
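The volume target changes in step with the Dockerfile's `WORKDIR`: the same `./crawls` host directory is now mounted at `/crawls`, the directory the crawler starts in, so the `--cwd` default and the mount agree. A tiny sanity check one could run inside the container (hypothetical, not part of the commit):

```js
// If the container's working directory drifts from the mount point, the
// process.cwd() default would write captures outside the mounted volume.
if (process.cwd() !== "/crawls") {
  console.warn(`captures will land under ${process.cwd()}, not the mounted volume`);
}
```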
main.js | 11

@@ -1,5 +1,16 @@
 #!/usr/bin/env node
 
+process.once("SIGINT", () => {
+  console.log("SIGINT received, exiting");
+  process.exit(1);
+});
+
+process.once("SIGTERM", () => {
+  console.log("SIGTERM received, exiting");
+  process.exit(1);
+});
+
+
 const { Crawler } = require("./crawler");
 
 new Crawler().run();
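The SIGINT/SIGTERM handlers removed from crawler.js are re-added here, so they are installed only by the CLI entrypoint rather than as a side effect of requiring the `Crawler` class (that rationale is an inference; the commit message does not state it). A hypothetical check of the difference:

```js
// Hypothetical check: after this commit, requiring the crawler module
// no longer registers process-wide signal handlers.
const before = process.listenerCount("SIGINT");
require("./crawler");
console.log(process.listenerCount("SIGINT") === before); // true post-commit
```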