mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
remove pywb dependency
- only keep py-wacz
- use cdxj-indexer for --generateCDX
This commit is contained in:
parent
468a00939d
commit
868cd7ab48
3 changed files with 11 additions and 5 deletions
|
@ -20,7 +20,6 @@ ENV PROXY_HOST=localhost \
|
|||
WORKDIR /app
|
||||
|
||||
ADD requirements.txt /app/
|
||||
RUN pip install 'uwsgi==2.0.21'
|
||||
RUN pip install -U setuptools; pip install -r requirements.txt
|
||||
|
||||
ADD package.json /app/
|
||||
|
|
12
crawler.js
12
crawler.js
|
@ -937,7 +937,17 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.info("Generating CDX");
|
||||
await fsp.mkdir(path.join(this.collDir, "indexes"), {recursive: true});
|
||||
await this.crawlState.setStatus("generate-cdx");
|
||||
const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
|
||||
|
||||
const warcList = await fsp.readdir(path.join(this.collDir, "archive"));
|
||||
const warcListFull = warcList.map((filename) => path.join(this.collDir, "archive", filename));
|
||||
|
||||
//const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
|
||||
const params = [
|
||||
"-o",
|
||||
path.join(this.collDir, "indexes", "index.cdxj"),
|
||||
...warcListFull
|
||||
];
|
||||
const indexResult = await this.awaitProcess(child_process.spawn("cdxj-indexer", params, {cwd: this.params.cwd}));
|
||||
if (indexResult === 0) {
|
||||
logger.debug("Indexing complete, CDX successfully created");
|
||||
} else {
|
||||
|
|
|
@ -1,4 +1 @@
|
|||
pywb>=2.7.4
|
||||
uwsgi
|
||||
wacz>=0.4.9
|
||||
requests[socks]
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue