fix indexing of cookie header: (#714)

- pass the `fields` option to the CDX indexer so that `req.http:cookie` and
`referrer` entries are included in the CDXJ index
- update to warcio 2.4.0 to support this functionality
This commit is contained in:
Ilya Kreymer 2024-11-13 23:13:40 -08:00 committed by GitHub
parent 60c84b342e
commit f56d6505c1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 20 additions and 3 deletions

View file

@@ -37,7 +37,7 @@
"tsc": "^2.0.4",
"undici": "^6.18.2",
"uuid": "8.3.2",
-"warcio": "^2.3.1",
+"warcio": "^2.4.0",
"ws": "^7.4.4",
"yargs": "^17.7.2"
},

View file

@@ -2,7 +2,7 @@ import fs from "fs";
import { Writable } from "stream";
import path from "path";
-import { CDXIndexer, WARCRecord } from "warcio";
+import { CDXIndexer, WARCRecord, DEFAULT_CDX_FIELDS } from "warcio";
import { WARCSerializer } from "warcio/node";
import { logger, formatErr, LogDetails, LogContext } from "./logger.js";
import type { IndexerOffsetLength } from "warcio";
@@ -76,7 +76,10 @@ export class WARCWriter implements IndexerOffsetLength {
this.recordLength = 0;
if (this.warcCdxDir) {
-this.indexer = new CDXIndexer({ format: "cdxj" });
+this.indexer = new CDXIndexer({
+format: "cdxj",
+fields: [...DEFAULT_CDX_FIELDS, "req.http:cookie", "referrer"],
+});
}
return filename;

View file

@@ -5295,6 +5295,20 @@ warcio@^2.3.1:
uuid-random "^1.3.2"
yargs "^17.6.2"
+warcio@^2.4.0:
+version "2.4.0"
+resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.0.tgz#13bae2837f1bbf5cf7585f75857e6311d30557bd"
+integrity sha512-EfxXCgsnZ35CGf2j99QBMyB6EI98KEQ6YmeER+8Lnv/4KFJ3thT76PiX37HfZVbPJS21JihA0Eddjk9QBQRlPg==
+dependencies:
+"@types/pako" "^1.0.7"
+"@types/stream-buffers" "^3.0.7"
+base32-encode "^2.0.0"
+hash-wasm "^4.9.0"
+pako "^1.0.11"
+tempy "^3.1.0"
+uuid-random "^1.3.2"
+yargs "^17.7.2"
web-encoding@^1.1.5:
version "1.1.5"
resolved "https://registry.yarnpkg.com/web-encoding/-/web-encoding-1.1.5.tgz#fc810cf7667364a6335c939913f5051d3e0c4864"