Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00
Merge branch 'recorder-work' into recorder-work-ts
commit df0fe887ce
10 changed files with 23 additions and 45 deletions
@@ -20,7 +20,6 @@ ENV PROXY_HOST=localhost \
 WORKDIR /app
 
 ADD requirements.txt /app/
-RUN pip install 'uwsgi==2.0.21'
 RUN pip install -U setuptools; pip install -r requirements.txt
 
 ADD package.json /app/
@@ -1,4 +1 @@
-pywb>=2.7.4
-uwsgi
 wacz>=0.4.9
-requests[socks]
@@ -756,11 +756,10 @@ self.__bx_behaviors.selectMainBehavior();
 }
 
 async getInfoString() {
-const packageFileJSON = JSON.parse(await fsp.readFile(new URL("../package.json", import.meta.url), {encoding: "utf-8"}));
-const warcioPackageJSON = JSON.parse(await fsp.readFile(new URL("../node_modules/warcio/package.json", import.meta.url), {encoding: "utf-8"}));
-const pywbVersion = "0.0";//child_process.execSync("pywb -V", {encoding: "utf8"}).trim().split(" ")[1];
+const packageFileJSON = JSON.parse(await fsp.readFile("../app/package.json"));
+const warcioPackageJSON = JSON.parse(await fsp.readFile("/app/node_modules/warcio/package.json"));
 
-return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version} pywb ${pywbVersion})`;
+return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version})`;
 }
 
 async createWARCInfo(filename: string) {
@@ -970,7 +969,7 @@ self.__bx_behaviors.selectMainBehavior();
 headless: this.params.headless,
 emulateDevice: this.emulateDevice,
 chromeOptions: {
-proxy: false,//!process.env.NO_PROXY,
+proxy: false,
 userAgent: this.emulateDevice.userAgent,
 extraArgs: this.extraChromeArgs()
 },
@@ -980,7 +979,6 @@ self.__bx_behaviors.selectMainBehavior();
 }
 } as any);
 
-//const archiveDir = path.join(this.collDir, "archive");
 
 // --------------
 // Run Crawl Here!
@@ -998,9 +996,6 @@ self.__bx_behaviors.selectMainBehavior();
 
 await this.writeStats();
 
-// extra wait for all resources to land into WARCs
-// now happens at end of each page
-// await this.awaitPendingClear();
 
 // if crawl has been stopped, mark as final exit for post-crawl tasks
 if (await this.crawlState.isCrawlStopped()) {
@@ -1019,7 +1014,17 @@ self.__bx_behaviors.selectMainBehavior();
 logger.info("Generating CDX");
 await fsp.mkdir(path.join(this.collDir, "indexes"), {recursive: true});
 await this.crawlState.setStatus("generate-cdx");
-const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
+
+const warcList = await fsp.readdir(path.join(this.collDir, "archive"));
+const warcListFull = warcList.map((filename) => path.join(this.collDir, "archive", filename));
+
+//const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
+const params = [
+"-o",
+path.join(this.collDir, "indexes", "index.cdxj"),
+...warcListFull
+];
+const indexResult = await this.awaitProcess(child_process.spawn("cdxj-indexer", params, {cwd: this.params.cwd}));
 if (indexResult === 0) {
 logger.debug("Indexing complete, CDX successfully created");
 } else {
@@ -11,7 +11,6 @@ import yargs, { Options } from "yargs";
 
 import { logger } from "./util/logger.js";
 
-import { sleep } from "./util/timing.js";
 import { Browser } from "./util/browser.js";
 import { initStorage } from "./util/storage.js";
 import { CDPSession, Page, Protocol, PuppeteerLifeCycleEvent } from "puppeteer-core";
@@ -144,18 +143,6 @@ async function main() {
 ]);
 }
-
-let useProxy = false;
-
-if (params.proxy) {
-child_process.spawn("wayback", ["--live", "--proxy", "live"], {stdio: "inherit", cwd: "/tmp"});
-
-logger.debug("Running with pywb proxy");
-
-await sleep(3000);
-
-useProxy = true;
-}
 
 const browser = new Browser();
 
 await browser.launch({
@@ -163,7 +150,7 @@ async function main() {
 headless: params.headless,
 signals: true,
 chromeOptions: {
-proxy: useProxy,
+proxy: false,
 extraArgs: [
 "--window-position=0,0",
 `--window-size=${params.windowSize}`,
@@ -178,7 +178,7 @@ class ArgParser {
 },
 
 "logging": {
-describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug",
+describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, debug",
 type: "array",
 default: ["stats"],
 coerce,
@@ -342,7 +342,6 @@ export class Browser
 
 for (const recorder of this.recorders) {
 if (recorder.swUrls.has(request.url)) {
-//console.log(`*** found sw ${request.url} in recorder for worker ${recorder.workerid}`);
 recorder.swFrameIds.add(frameId);
 }
 
@@ -353,7 +352,7 @@ export class Browser
 }
 
 if (!foundRecorder) {
-logger.warn("Skipping URL from unknown frame", {url: request.url, frameId}, "recorder");
+logger.debug("Skipping URL from unknown frame", {url: request.url, frameId}, "recorder");
 
 try {
 await this.firstCDP.send("Fetch.continueResponse", {requestId});
@@ -430,17 +430,15 @@ export class Recorder
 startPage({pageid, url} : {pageid: string, url: string}) {
 this.pageid = pageid;
 this.logDetails = {page: url, workerid: this.workerid};
-// if (this.pendingRequests && this.pendingRequests.size) {
-// logger.warn("Interrupting timed out requests, moving to next page", this.logDetails, "recorder");
-// }
+if (this.pendingRequests && this.pendingRequests.size) {
+logger.debug("Interrupting timed out requests, moving to next page", this.logDetails, "recorder");
+}
 this.pendingRequests = new Map();
 this.skipIds = new Set();
 this.skipping = false;
 }
 
 async finishPage() {
-//this.skipping = true;
-
 for (const [requestId, reqresp] of this.pendingRequests.entries()) {
 if (reqresp.payload) {
 this.removeReqResp(requestId);
@@ -474,7 +472,7 @@ export class Recorder
 }
 
 async onClosePage() {
-
+// Any page-specific handling before page is closed.
 }
 
 async onDone() {
@@ -709,7 +707,6 @@ class AsyncFetcher
 constructor({tempdir, reqresp, expectedSize = -1, recorder, networkId, filter = undefined, ignoreDupe = false} :
 {tempdir: string, reqresp: RequestResponseInfo, expectedSize?: number, recorder: Recorder,
 networkId: string, filter?: (resp: Response) => boolean, ignoreDupe?: boolean }) {
-//super();
 this.reqresp = reqresp;
 this.reqresp.expectedSize = expectedSize;
 this.reqresp.asyncLoading = true;
@@ -76,7 +76,6 @@ export class RequestResponseInfo
 this.resourceType = params.type;
 }
 
-//this.loaderId = params.loaderId;
 }
 
 fillFetchRequestPaused(params: Record<string, any>) {
@@ -72,7 +72,6 @@ export class WARCWriter implements IndexerOffsetLength
 
 async _writeRecord(record: WARCRecord, serializer: WARCSerializer) {
 let total = 0;
-let count = 0;
 const url = record.warcTargetURI;
 
 if (!this.fh) {
@@ -81,15 +80,11 @@ export class WARCWriter implements IndexerOffsetLength
 
 for await (const chunk of serializer) {
 total += chunk.length;
-count++;
 try {
 this.fh.write(chunk);
 } catch (e) {
 logger.error("Error writing to WARC, corruption possible", {...errJSON(e), url, ...this.logDetails}, "writer");
 }
-if (!(count % 10)) {
-//logNetwork("Writing WARC Chunk", {total, count, url, logDetails});
-}
 }
 
 return total;
@@ -21,7 +21,7 @@ test("check that the warcinfo file works as expected on the command line", async
 
 expect(string.indexOf("operator: test")).toBeGreaterThan(-1);
 expect(string.indexOf("host: hostname")).toBeGreaterThan(-1);
-expect(string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+ pywb \d[\w.-]+\)/)).not.toEqual(null);
+expect(string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/)).not.toEqual(null);
 expect(string.indexOf("format: WARC File Format 1.0")).toBeGreaterThan(-1);
 
 