Merge branch 'recorder-work' into recorder-work-ts

Ilya Kreymer 2023-11-07 22:01:37 -08:00
commit df0fe887ce
10 changed files with 23 additions and 45 deletions

View file

@@ -20,7 +20,6 @@ ENV PROXY_HOST=localhost \
WORKDIR /app
ADD requirements.txt /app/
RUN pip install 'uwsgi==2.0.21'
RUN pip install -U setuptools; pip install -r requirements.txt
ADD package.json /app/

View file

@@ -1,4 +1 @@
pywb>=2.7.4
uwsgi
wacz>=0.4.9
requests[socks]

View file

@@ -756,11 +756,10 @@ self.__bx_behaviors.selectMainBehavior();
}
async getInfoString() {
const packageFileJSON = JSON.parse(await fsp.readFile(new URL("../package.json", import.meta.url), {encoding: "utf-8"}));
const warcioPackageJSON = JSON.parse(await fsp.readFile(new URL("../node_modules/warcio/package.json", import.meta.url), {encoding: "utf-8"}));
const pywbVersion = "0.0";//child_process.execSync("pywb -V", {encoding: "utf8"}).trim().split(" ")[1];
const packageFileJSON = JSON.parse(await fsp.readFile("../app/package.json"));
const warcioPackageJSON = JSON.parse(await fsp.readFile("/app/node_modules/warcio/package.json"));
return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version} pywb ${pywbVersion})`;
return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version})`;
}
async createWARCInfo(filename: string) {
@@ -970,7 +969,7 @@ self.__bx_behaviors.selectMainBehavior();
headless: this.params.headless,
emulateDevice: this.emulateDevice,
chromeOptions: {
proxy: false,//!process.env.NO_PROXY,
proxy: false,
userAgent: this.emulateDevice.userAgent,
extraArgs: this.extraChromeArgs()
},
@@ -980,7 +979,6 @@ self.__bx_behaviors.selectMainBehavior();
}
} as any);
//const archiveDir = path.join(this.collDir, "archive");
// --------------
// Run Crawl Here!
@@ -998,9 +996,6 @@ self.__bx_behaviors.selectMainBehavior();
await this.writeStats();
// extra wait for all resources to land into WARCs
// now happens at end of each page
// await this.awaitPendingClear();
// if crawl has been stopped, mark as final exit for post-crawl tasks
if (await this.crawlState.isCrawlStopped()) {
@@ -1019,7 +1014,17 @@ self.__bx_behaviors.selectMainBehavior();
logger.info("Generating CDX");
await fsp.mkdir(path.join(this.collDir, "indexes"), {recursive: true});
await this.crawlState.setStatus("generate-cdx");
const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
const warcList = await fsp.readdir(path.join(this.collDir, "archive"));
const warcListFull = warcList.map((filename) => path.join(this.collDir, "archive", filename));
//const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
const params = [
"-o",
path.join(this.collDir, "indexes", "index.cdxj"),
...warcListFull
];
const indexResult = await this.awaitProcess(child_process.spawn("cdxj-indexer", params, {cwd: this.params.cwd}));
if (indexResult === 0) {
logger.debug("Indexing complete, CDX successfully created");
} else {

View file

@@ -11,7 +11,6 @@ import yargs, { Options } from "yargs";
import { logger } from "./util/logger.js";
import { sleep } from "./util/timing.js";
import { Browser } from "./util/browser.js";
import { initStorage } from "./util/storage.js";
import { CDPSession, Page, Protocol, PuppeteerLifeCycleEvent } from "puppeteer-core";
@@ -144,18 +143,6 @@ async function main() {
]);
}
let useProxy = false;
if (params.proxy) {
child_process.spawn("wayback", ["--live", "--proxy", "live"], {stdio: "inherit", cwd: "/tmp"});
logger.debug("Running with pywb proxy");
await sleep(3000);
useProxy = true;
}
const browser = new Browser();
await browser.launch({
@@ -163,7 +150,7 @@ async function main() {
headless: params.headless,
signals: true,
chromeOptions: {
proxy: useProxy,
proxy: false,
extraArgs: [
"--window-position=0,0",
`--window-size=${params.windowSize}`,

View file

@@ -178,7 +178,7 @@ class ArgParser {
},
"logging": {
describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug",
describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, debug",
type: "array",
default: ["stats"],
coerce,

View file

@@ -342,7 +342,6 @@ export class Browser
for (const recorder of this.recorders) {
if (recorder.swUrls.has(request.url)) {
//console.log(`*** found sw ${request.url} in recorder for worker ${recorder.workerid}`);
recorder.swFrameIds.add(frameId);
}
@@ -353,7 +352,7 @@ export class Browser
}
if (!foundRecorder) {
logger.warn("Skipping URL from unknown frame", {url: request.url, frameId}, "recorder");
logger.debug("Skipping URL from unknown frame", {url: request.url, frameId}, "recorder");
try {
await this.firstCDP.send("Fetch.continueResponse", {requestId});

View file

@@ -430,17 +430,15 @@ export class Recorder
startPage({pageid, url} : {pageid: string, url: string}) {
this.pageid = pageid;
this.logDetails = {page: url, workerid: this.workerid};
// if (this.pendingRequests && this.pendingRequests.size) {
// logger.warn("Interrupting timed out requests, moving to next page", this.logDetails, "recorder");
// }
if (this.pendingRequests && this.pendingRequests.size) {
logger.debug("Interrupting timed out requests, moving to next page", this.logDetails, "recorder");
}
this.pendingRequests = new Map();
this.skipIds = new Set();
this.skipping = false;
}
async finishPage() {
//this.skipping = true;
for (const [requestId, reqresp] of this.pendingRequests.entries()) {
if (reqresp.payload) {
this.removeReqResp(requestId);
@@ -474,7 +472,7 @@ export class Recorder
}
async onClosePage() {
// Any page-specific handling before page is closed.
}
async onDone() {
@@ -709,7 +707,6 @@ class AsyncFetcher
constructor({tempdir, reqresp, expectedSize = -1, recorder, networkId, filter = undefined, ignoreDupe = false} :
{tempdir: string, reqresp: RequestResponseInfo, expectedSize?: number, recorder: Recorder,
networkId: string, filter?: (resp: Response) => boolean, ignoreDupe?: boolean }) {
//super();
this.reqresp = reqresp;
this.reqresp.expectedSize = expectedSize;
this.reqresp.asyncLoading = true;

View file

@@ -76,7 +76,6 @@ export class RequestResponseInfo
this.resourceType = params.type;
}
//this.loaderId = params.loaderId;
}
fillFetchRequestPaused(params: Record<string, any>) {

View file

@@ -72,7 +72,6 @@ export class WARCWriter implements IndexerOffsetLength
async _writeRecord(record: WARCRecord, serializer: WARCSerializer) {
let total = 0;
let count = 0;
const url = record.warcTargetURI;
if (!this.fh) {
@@ -81,15 +80,11 @@ export class WARCWriter implements IndexerOffsetLength
for await (const chunk of serializer) {
total += chunk.length;
count++;
try {
this.fh.write(chunk);
} catch (e) {
logger.error("Error writing to WARC, corruption possible", {...errJSON(e), url, ...this.logDetails}, "writer");
}
if (!(count % 10)) {
//logNetwork("Writing WARC Chunk", {total, count, url, logDetails});
}
}
return total;

View file

@@ -21,7 +21,7 @@ test("check that the warcinfo file works as expected on the command line", async
expect(string.indexOf("operator: test")).toBeGreaterThan(-1);
expect(string.indexOf("host: hostname")).toBeGreaterThan(-1);
expect(string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+ pywb \d[\w.-]+\)/)).not.toEqual(null);
expect(string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/)).not.toEqual(null);
expect(string.indexOf("format: WARC File Format 1.0")).toBeGreaterThan(-1);