Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 14:33:17 +00:00)
Merge branch 'recorder-work' into recorder-work-ts
Commit: df0fe887ce
10 changed files with 23 additions and 45 deletions
@@ -20,7 +20,6 @@ ENV PROXY_HOST=localhost \
 WORKDIR /app
 
 ADD requirements.txt /app/
-RUN pip install 'uwsgi==2.0.21'
 RUN pip install -U setuptools; pip install -r requirements.txt
 
 ADD package.json /app/
@@ -1,4 +1 @@
-pywb>=2.7.4
-uwsgi
 wacz>=0.4.9
-requests[socks]
@@ -756,11 +756,10 @@ self.__bx_behaviors.selectMainBehavior();
   }
 
   async getInfoString() {
-    const packageFileJSON = JSON.parse(await fsp.readFile(new URL("../package.json", import.meta.url), {encoding: "utf-8"}));
-    const warcioPackageJSON = JSON.parse(await fsp.readFile(new URL("../node_modules/warcio/package.json", import.meta.url), {encoding: "utf-8"}));
-    const pywbVersion = "0.0";//child_process.execSync("pywb -V", {encoding: "utf8"}).trim().split(" ")[1];
+    const packageFileJSON = JSON.parse(await fsp.readFile("../app/package.json"));
+    const warcioPackageJSON = JSON.parse(await fsp.readFile("/app/node_modules/warcio/package.json"));
 
-    return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version} pywb ${pywbVersion})`;
+    return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version})`;
   }
 
   async createWARCInfo(filename: string) {
@@ -970,7 +969,7 @@ self.__bx_behaviors.selectMainBehavior();
       headless: this.params.headless,
       emulateDevice: this.emulateDevice,
       chromeOptions: {
-        proxy: false,//!process.env.NO_PROXY,
+        proxy: false,
         userAgent: this.emulateDevice.userAgent,
         extraArgs: this.extraChromeArgs()
       },
@@ -980,7 +979,6 @@ self.__bx_behaviors.selectMainBehavior();
       }
     } as any);
 
-    //const archiveDir = path.join(this.collDir, "archive");
 
     // --------------
     // Run Crawl Here!
@@ -998,9 +996,6 @@ self.__bx_behaviors.selectMainBehavior();
 
     await this.writeStats();
 
-    // extra wait for all resources to land into WARCs
-    // now happens at end of each page
-    // await this.awaitPendingClear();
 
     // if crawl has been stopped, mark as final exit for post-crawl tasks
     if (await this.crawlState.isCrawlStopped()) {
@@ -1019,7 +1014,17 @@ self.__bx_behaviors.selectMainBehavior();
     logger.info("Generating CDX");
     await fsp.mkdir(path.join(this.collDir, "indexes"), {recursive: true});
     await this.crawlState.setStatus("generate-cdx");
-    const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
+
+    const warcList = await fsp.readdir(path.join(this.collDir, "archive"));
+    const warcListFull = warcList.map((filename) => path.join(this.collDir, "archive", filename));
+
+    //const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
+    const params = [
+      "-o",
+      path.join(this.collDir, "indexes", "index.cdxj"),
+      ...warcListFull
+    ];
+    const indexResult = await this.awaitProcess(child_process.spawn("cdxj-indexer", params, {cwd: this.params.cwd}));
     if (indexResult === 0) {
       logger.debug("Indexing complete, CDX successfully created");
     } else {
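Note: this.awaitProcess(...) is used above but not defined in this diff. As a rough, hypothetical sketch (the helper name, signature, and placement are assumptions, not the crawler's actual implementation), such a helper typically just resolves with the spawned process's exit code:

import { spawn, ChildProcess } from "child_process";

// Hypothetical helper (illustrative only): resolve with the child's exit
// code so a caller can test a result such as `indexResult === 0`.
function awaitProcess(proc: ChildProcess): Promise<number> {
  return new Promise((resolve) => {
    proc.on("close", (code) => resolve(code ?? -1));
  });
}

// Illustrative usage mirroring the hunk above (paths are placeholders):
// const indexResult = await awaitProcess(
//   spawn("cdxj-indexer", ["-o", "indexes/index.cdxj", "archive/example.warc.gz"]));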
@@ -11,7 +11,6 @@ import yargs, { Options } from "yargs";
 
 import { logger } from "./util/logger.js";
 
-import { sleep } from "./util/timing.js";
 import { Browser } from "./util/browser.js";
 import { initStorage } from "./util/storage.js";
 import { CDPSession, Page, Protocol, PuppeteerLifeCycleEvent } from "puppeteer-core";
@@ -144,18 +143,6 @@ async function main() {
     ]);
   }
 
-  let useProxy = false;
-
-  if (params.proxy) {
-    child_process.spawn("wayback", ["--live", "--proxy", "live"], {stdio: "inherit", cwd: "/tmp"});
-
-    logger.debug("Running with pywb proxy");
-
-    await sleep(3000);
-
-    useProxy = true;
-  }
-
   const browser = new Browser();
 
   await browser.launch({
@@ -163,7 +150,7 @@ async function main() {
     headless: params.headless,
     signals: true,
     chromeOptions: {
-      proxy: useProxy,
+      proxy: false,
      extraArgs: [
        "--window-position=0,0",
        `--window-size=${params.windowSize}`,
@@ -178,7 +178,7 @@ class ArgParser {
       },
 
       "logging": {
-        describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug",
+        describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, debug",
         type: "array",
         default: ["stats"],
         coerce,
@@ -342,7 +342,6 @@ export class Browser
 
       for (const recorder of this.recorders) {
        if (recorder.swUrls.has(request.url)) {
-          //console.log(`*** found sw ${request.url} in recorder for worker ${recorder.workerid}`);
          recorder.swFrameIds.add(frameId);
        }
 
@@ -353,7 +352,7 @@ export class Browser
      }
 
      if (!foundRecorder) {
-        logger.warn("Skipping URL from unknown frame", {url: request.url, frameId}, "recorder");
+        logger.debug("Skipping URL from unknown frame", {url: request.url, frameId}, "recorder");
 
        try {
          await this.firstCDP.send("Fetch.continueResponse", {requestId});
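For context, the Fetch.continueResponse call above belongs to the Chrome DevTools Protocol Fetch domain. A minimal sketch of that general pause/continue flow with a puppeteer-core CDPSession (illustrative only; this is not the crawler's actual interception wiring):

import { CDPSession } from "puppeteer-core";

// Generic CDP Fetch interception sketch: pause responses, inspect them,
// then let them continue unchanged.
async function interceptResponses(cdp: CDPSession) {
  await cdp.send("Fetch.enable", {patterns: [{urlPattern: "*", requestStage: "Response"}]});

  cdp.on("Fetch.requestPaused", async ({requestId, request, frameId}) => {
    // request.url and frameId can be checked here (e.g. against known
    // service worker URLs) before deciding how to proceed.
    try {
      await cdp.send("Fetch.continueResponse", {requestId});
    } catch (e) {
      // The paused request may already be gone; nothing to do.
    }
  });
}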
@@ -430,17 +430,15 @@ export class Recorder
   startPage({pageid, url} : {pageid: string, url: string}) {
     this.pageid = pageid;
     this.logDetails = {page: url, workerid: this.workerid};
-    // if (this.pendingRequests && this.pendingRequests.size) {
-    // logger.warn("Interrupting timed out requests, moving to next page", this.logDetails, "recorder");
-    // }
+    if (this.pendingRequests && this.pendingRequests.size) {
+      logger.debug("Interrupting timed out requests, moving to next page", this.logDetails, "recorder");
+    }
     this.pendingRequests = new Map();
     this.skipIds = new Set();
     this.skipping = false;
   }
 
   async finishPage() {
-    //this.skipping = true;
-
     for (const [requestId, reqresp] of this.pendingRequests.entries()) {
       if (reqresp.payload) {
         this.removeReqResp(requestId);
@@ -474,7 +472,7 @@ export class Recorder
   }
 
   async onClosePage() {
-
+    // Any page-specific handling before page is closed.
   }
 
   async onDone() {
@@ -709,7 +707,6 @@ class AsyncFetcher
   constructor({tempdir, reqresp, expectedSize = -1, recorder, networkId, filter = undefined, ignoreDupe = false} :
               {tempdir: string, reqresp: RequestResponseInfo, expectedSize?: number, recorder: Recorder,
                networkId: string, filter?: (resp: Response) => boolean, ignoreDupe?: boolean }) {
-    //super();
     this.reqresp = reqresp;
     this.reqresp.expectedSize = expectedSize;
     this.reqresp.asyncLoading = true;
@@ -76,7 +76,6 @@ export class RequestResponseInfo
       this.resourceType = params.type;
     }
 
-    //this.loaderId = params.loaderId;
   }
 
   fillFetchRequestPaused(params: Record<string, any>) {
@@ -72,7 +72,6 @@ export class WARCWriter implements IndexerOffsetLength
 
   async _writeRecord(record: WARCRecord, serializer: WARCSerializer) {
     let total = 0;
-    let count = 0;
     const url = record.warcTargetURI;
 
     if (!this.fh) {
@@ -81,15 +80,11 @@ export class WARCWriter implements IndexerOffsetLength
 
     for await (const chunk of serializer) {
       total += chunk.length;
-      count++;
       try {
         this.fh.write(chunk);
       } catch (e) {
         logger.error("Error writing to WARC, corruption possible", {...errJSON(e), url, ...this.logDetails}, "writer");
       }
-      if (!(count % 10)) {
-        //logNetwork("Writing WARC Chunk", {total, count, url, logDetails});
-      }
     }
 
     return total;
@@ -21,7 +21,7 @@ test("check that the warcinfo file works as expected on the command line", async
 
   expect(string.indexOf("operator: test")).toBeGreaterThan(-1);
   expect(string.indexOf("host: hostname")).toBeGreaterThan(-1);
-  expect(string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+ pywb \d[\w.-]+\)/)).not.toEqual(null);
+  expect(string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/)).not.toEqual(null);
   expect(string.indexOf("format: WARC File Format 1.0")).toBeGreaterThan(-1);