remove pywb dependency

- keep only py-wacz (wacz) in requirements.txt
- use cdxj-indexer instead of pywb's `wb-manager reindex` for --generateCDX
This commit is contained in:
Ilya Kreymer 2023-11-07 20:01:42 -08:00
parent 468a00939d
commit 868cd7ab48
3 changed files with 11 additions and 5 deletions

View file

@ -20,7 +20,6 @@ ENV PROXY_HOST=localhost \
WORKDIR /app
ADD requirements.txt /app/
RUN pip install 'uwsgi==2.0.21'
RUN pip install -U setuptools; pip install -r requirements.txt
ADD package.json /app/

View file

@ -937,7 +937,17 @@ self.__bx_behaviors.selectMainBehavior();
logger.info("Generating CDX");
await fsp.mkdir(path.join(this.collDir, "indexes"), {recursive: true});
await this.crawlState.setStatus("generate-cdx");
const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
const warcList = await fsp.readdir(path.join(this.collDir, "archive"));
const warcListFull = warcList.map((filename) => path.join(this.collDir, "archive", filename));
//const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
const params = [
"-o",
path.join(this.collDir, "indexes", "index.cdxj"),
...warcListFull
];
const indexResult = await this.awaitProcess(child_process.spawn("cdxj-indexer", params, {cwd: this.params.cwd}));
if (indexResult === 0) {
logger.debug("Indexing complete, CDX successfully created");
} else {

View file

@ -1,4 +1 @@
pywb>=2.7.4
uwsgi
wacz>=0.4.9
requests[socks]