support custom crawl directory with --cwd flag, default to /crawls

update README
Ilya Kreymer 2020-11-02 15:28:19 +00:00
parent e2bce2f30d
commit 8f740d4e24
5 changed files with 29 additions and 20 deletions

Dockerfile

@@ -34,7 +34,7 @@ ADD *.js /app/
RUN ln -s /app/main.js /usr/bin/crawl
-WORKDIR /output
+WORKDIR /crawls
CMD ["crawl"]

README.md

@@ -26,9 +26,9 @@ The system uses:
- `pywb` - in recording mode for capturing the content
-The crawl produces a single pywb collection, at `/output/collections/<collection name>` in the Docker container.
+The crawl produces a single pywb collection, at `/crawls/collections/<collection name>` in the Docker container.
-To access the contents of the crawl, the `/output` directory should be mounted to a volume (default in the Docker Compose setup).
+To access the contents of the crawl, the `/crawls` directory in the container should be mounted to a volume (default in the Docker Compose setup).
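For example, with `./crawls` mounted to `/crawls` (as in the Docker Compose setup), a finished crawl named `wr-net` would appear on the host in the usual pywb collection layout. A rough sketch, assuming default settings:

```bash
ls ./crawls/collections/wr-net
# archive/   - WARC files captured by pywb
# indexes/   - CDXJ index (e.g. after a run with --generateCDX)
```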
## Crawling Parameters
@@ -62,9 +62,11 @@ Options:
--headless Run in headless mode, otherwise start xvfb
[boolean] [default: false]
--driver JS driver for the crawler
[string] [default: "/Users/ilya/work/browsertrix-crawler/defaultDriver.js"]
[string] [default: "/app/defaultDriver.js"]
--generateCDX If set, generate index (CDXJ) for use with pywb after crawl
is done [boolean] [default: false]
+--cwd Crawl working directory for captures (pywb root). If not
+set, defaults to process.cwd [string] [default: "/crawls"]
```
For the `--waitUntil` flag, see [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options).
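To illustrate the new flag, a crawl can point its pywb root at a directory other than the default `/crawls`. A sketch only; the mount path, target directory, and URL below are made up for the example:

```bash
docker run -v $PWD/my-crawls:/data --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1g -it webrecorder/browsertrix-crawler --url https://example.com/ --generateCDX --cwd /data
```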
@@ -86,9 +88,9 @@ docker-compose build
docker-compose run crawler crawl --url https://webrecorder.net/ --generateCDX --collection wr-net --workers 2
```
-While the crawl is running, puppeteer-cluster provides monitoring output which is enabled by default and prints the crawl status to the Docker log.
+In this example, the crawl data is written to `./crawls/collections/wr-net` by default.
-The output is written to `./crawls/collections/wr-net` by default.
+While the crawl is running, the status of the crawl (provided by puppeteer-cluster monitoring) prints the progress to the Docker log.
When done, you can even use the browsertrix-crawler image to also start a local [pywb](https://github.com/webrecorder/pywb) instance
to preview the crawl:
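The exact command falls outside this hunk; a sketch of such a preview invocation (the `pywb` server command, port mapping, and paths here are assumptions, not taken from this commit):

```bash
docker run -it -v $PWD/crawls:/crawls -p 8080:8080 webrecorder/browsertrix-crawler pywb
```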
@@ -110,7 +112,7 @@ flags are [needed to run Chrome in Docker](https://github.com/puppeteer/puppetee
```bash
-docker run -v $PWD/crawls:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1g -it webrecorder/browsertrix-crawler --url https://webrecorder.net/ --workers 2
+docker run -v $PWD/crawls:/crawls --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1g -it webrecorder/browsertrix-crawler --url https://webrecorder.net/ --workers 2
```

crawler.js

@@ -16,16 +16,6 @@ const HTTPS_AGENT = require("https").Agent({
const HTTP_AGENT = require("http").Agent();
process.once("SIGINT", () => {
console.log("SIGINT received, exiting");
process.exit(1);
});
process.once("SIGTERM", () => {
console.log("SIGTERM received, exiting");
process.exit(1);
});
// ============================================================================
class Crawler {
@@ -52,7 +42,7 @@ class Crawler {
}
bootstrap() {
-const opts = {stdio: "ignore"};
+const opts = {stdio: "ignore", cwd: this.params.cwd};
child_process.spawn("redis-server", {...opts, cwd: "/tmp/"});
@@ -153,6 +143,12 @@ class Crawler {
describe: "If set, generate index (CDXJ) for use with pywb after crawl is done",
type: "boolean",
default: false,
},
"cwd": {
describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd",
type: "string",
default: process.cwd(),
}
};
}
@@ -307,7 +303,7 @@ class Crawler {
if (this.params.generateCdx) {
console.log("Generate CDX");
-child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit"});
+child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd});
}
}
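Passing `cwd: this.params.cwd` to `spawnSync` makes `wb-manager` run from the crawl working directory, roughly equivalent to reindexing by hand (collection name illustrative):

```bash
cd /crawls && wb-manager reindex my-crawl
```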

docker-compose.yml

@@ -7,7 +7,7 @@ services:
context: ./
volumes:
-  - ./crawls:/output
+  - ./crawls:/crawls
cap_add:
- NET_ADMIN

main.js

@@ -1,5 +1,16 @@
#!/usr/bin/env node
process.once("SIGINT", () => {
console.log("SIGINT received, exiting");
process.exit(1);
});
process.once("SIGTERM", () => {
console.log("SIGTERM received, exiting");
process.exit(1);
});
const { Crawler } = require("./crawler");
new Crawler().run();