Mirror of https://github.com/webrecorder/browsertrix-crawler.git
Synced 2025-10-19 06:23:16 +00:00

support custom crawl directory with --cwd flag, default to /crawls

update README

This commit is contained in:
parent e2bce2f30d
commit 8f740d4e24

5 changed files with 29 additions and 20 deletions
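Condensed into one self-contained sketch, the change threads a new `--cwd` option into the pywb-related subprocesses. The option definition below is copied from the crawler.js hunk; the yargs wiring around it and the `my-collection` name are assumptions for illustration, not the crawler's actual entrypoint:

```js
#!/usr/bin/env node
// Sketch of the pattern this commit applies: a --cwd flag whose default is
// process.cwd(), passed as the working directory of every pywb subprocess.
// The yargs scaffolding is an assumption based on the option shape in crawler.js.
const child_process = require("child_process");

const params = require("yargs").options({
  "cwd": {
    describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd",
    type: "string",
    default: process.cwd(),
  }
}).argv;

// Subprocesses that read or write the pywb collection run in that directory
// ("my-collection" is a hypothetical collection name):
child_process.spawnSync("wb-manager", ["reindex", "my-collection"],
  {stdio: "inherit", cwd: params.cwd});
```

With the Dockerfile's `WORKDIR /crawls`, the `process.cwd()` default resolves to `/crawls` inside the container, which is why the README and docker-compose.yml below now mount the volume at that path.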
Dockerfile

@@ -34,7 +34,7 @@ ADD *.js /app/
 RUN ln -s /app/main.js /usr/bin/crawl
 
-WORKDIR /output
+WORKDIR /crawls
 
 CMD ["crawl"]
README.md | 14

@@ -26,9 +26,9 @@ The system uses:
 - `pywb` - in recording mode for capturing the content
 
-The crawl produces a single pywb collection, at `/output/collections/<collection name>` in the Docker container.
+The crawl produces a single pywb collection, at `/crawls/collections/<collection name>` in the Docker container.
 
-To access the contents of the crawl, the `/output` directory should be mounted to a volume (default in the Docker Compose setup).
+To access the contents of the crawl, the `/crawls` directory in the container should be mounted to a volume (default in the Docker Compose setup).
 
 ## Crawling Parameters
 
@@ -62,9 +62,11 @@ Options:
   --headless     Run in headless mode, otherwise start xvfb
                                                 [boolean] [default: false]
   --driver       JS driver for the crawler
-                 [string] [default: "/Users/ilya/work/browsertrix-crawler/defaultDriver.js"]
+                 [string] [default: "/app/defaultDriver.js"]
   --generateCDX  If set, generate index (CDXJ) for use with pywb after crawl
                  is done                        [boolean] [default: false]
+  --cwd          Crawl working directory for captures (pywb root). If not
+                 set, defaults to process.cwd   [string] [default: "/crawls"]
 ```
 
 For the `--waitUntil` flag, see [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options).
 
@@ -86,9 +88,9 @@ docker-compose build
 docker-compose run crawler crawl --url https://webrecorder.net/ --generateCDX --collection wr-net --workers 2
 ```
 
-While the crawl is running, puppeteer-cluster provides monitoring output which is enabled by default and prints the crawl status to the Docker log.
+In this example, the crawl data is written to `./crawls/collections/wr-net` by default.
 
-The output is written to `./crawls/collections/wr-net` by default.
+While the crawl is running, the status of the crawl (provided by puppeteer-cluster monitoring) prints the progress to the Docker log.
 
 When done, you can even use the browsertrix-crawler image to also start a local [pywb](https://github.com/webrecorder/pywb) instance
 to preview the crawl:
 
@@ -110,7 +112,7 @@ flags are [needed to run Chrome in Docker](https://github.com/puppeteer/puppetee
 
 ```bash
-docker run -v $PWD/crawls:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1g -it webrecorder/browsertrix-crawler --url https://webrecorder.net/ --workers 2
+docker run -v $PWD/crawls:/crawls --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1g -it webrecorder/browsertrix-crawler --url https://webrecorder.net/ --workers 2
 ```
crawler.js | 20

@@ -16,16 +16,6 @@ const HTTPS_AGENT = require("https").Agent({
 const HTTP_AGENT = require("http").Agent();
 
-process.once("SIGINT", () => {
-  console.log("SIGINT received, exiting");
-  process.exit(1);
-});
-
-process.once("SIGTERM", () => {
-  console.log("SIGTERM received, exiting");
-  process.exit(1);
-});
-
 
 // ============================================================================
 class Crawler {
@@ -52,7 +42,7 @@ class Crawler {
   }
 
   bootstrap() {
-    const opts = {stdio: "ignore"};
+    const opts = {stdio: "ignore", cwd: this.params.cwd};
 
     child_process.spawn("redis-server", {...opts, cwd: "/tmp/"});
 
@@ -153,6 +143,12 @@ class Crawler {
        describe: "If set, generate index (CDXJ) for use with pywb after crawl is done",
        type: "boolean",
        default: false,
      },
+
+      "cwd": {
+        describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd",
+        type: "string",
+        default: process.cwd(),
+      }
    };
  }
 
@@ -307,7 +303,7 @@ class Crawler {
    if (this.params.generateCdx) {
      console.log("Generate CDX");
 
-      child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit"});
+      child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd});
    }
  }
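One subtlety in the bootstrap hunk above: `opts` now carries the crawl directory, but the redis-server spawn spreads `opts` and then sets `cwd: "/tmp/"`, so redis keeps its own working directory. With object spread, later properties win; a quick illustration:

```js
// Later properties override earlier ones in an object spread, so the
// redis-server child keeps /tmp/ as its cwd even after this commit.
const opts = {stdio: "ignore", cwd: "/crawls"};
console.log({...opts, cwd: "/tmp/"});
// => { stdio: 'ignore', cwd: '/tmp/' }
```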
docker-compose.yml

@@ -7,7 +7,7 @@ services:
       context: ./
 
     volumes:
-      - ./crawls:/output
+      - ./crawls:/crawls
 
     cap_add:
      - NET_ADMIN
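The volume target changes in step with the Dockerfile's `WORKDIR`: the same `./crawls` host directory is now mounted at `/crawls`, the directory the crawler starts in, so the `--cwd` default and the mount agree. A tiny sanity check one could run inside the container (hypothetical, not part of the commit):

```js
// If the container's working directory drifts from the mount point, the
// process.cwd() default would write captures outside the mounted volume.
if (process.cwd() !== "/crawls") {
  console.warn(`captures will land under ${process.cwd()}, not the mounted volume`);
}
```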
main.js | 11

@@ -1,5 +1,16 @@
 #!/usr/bin/env node
 
+process.once("SIGINT", () => {
+  console.log("SIGINT received, exiting");
+  process.exit(1);
+});
+
+process.once("SIGTERM", () => {
+  console.log("SIGTERM received, exiting");
+  process.exit(1);
+});
+
+
 const { Crawler } = require("./crawler");
 
 new Crawler().run();
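The SIGINT/SIGTERM handlers removed from crawler.js are re-added here, so they are installed only by the CLI entrypoint rather than as a side effect of requiring the `Crawler` class (that rationale is an inference; the commit message does not state it). A hypothetical check of the difference:

```js
// Hypothetical check: after this commit, requiring the crawler module
// no longer registers process-wide signal handlers.
const before = process.listenerCount("SIGINT");
require("./crawler");
console.log(process.listenerCount("SIGINT") === before); // true post-commit
```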