mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-12-07 13:49:47 +00:00
Add downloads dir to cache external dependencies within the crawl (#921)
Fixes #920. Downloads the profile, custom behaviors, and seed list into the `/downloads` directory in the crawl:

- Seed file: downloaded into `/downloads`; never refetched if it already exists on subsequent crawl restarts.
- Custom behaviors (Git): downloaded into a directory, then moved to `/downloads/behaviors/<dir name>`; if the directory already exists, a failed download reuses the existing directory.
- Custom behaviors (file): downloaded into a temp file, then moved to `/downloads/behaviors/<name.js>`; if the file already exists, a failed download reuses the existing file.
- Profile: the `/profile` directory now contains the browser profile.
- Profile: downloaded to a temp file, then placed into `/downloads/profile.tar.gz`; if the download fails but a profile already exists, the existing `/profile` directory is used.

Also fixes #897
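Each of these items follows the same caching pattern that the diff below implements: fetch into a uniquely named temp file inside the downloads directory, rename it into place on success, and fall back to the previously downloaded copy if the fetch fails. Here is a minimal sketch of that pattern, assuming Node 18+ global `fetch`; the `cachedDownload` helper and its `refetch` flag are illustrative names, not identifiers from the crawler code:

```ts
import fsp from "node:fs/promises";
import crypto from "node:crypto";
import path from "node:path";

// Illustrative sketch: fetch `url` into `<downloadsDir>/<name>`, reusing a copy
// cached by a previous crawl restart when the fetch fails (or skipping the
// fetch entirely when a refetch is not wanted).
async function cachedDownload(
  downloadsDir: string,
  name: string,
  url: string,
  refetch = true,
): Promise<string> {
  const target = path.join(downloadsDir, name);
  const exists = await fsp.access(target).then(
    () => true,
    () => false,
  );

  // e.g. the seed file: never refetched once it exists
  if (exists && !refetch) {
    return target;
  }

  // download to a uniquely named temp file first, then rename into place
  const tmp = path.join(
    downloadsDir,
    `${crypto.randomBytes(4).toString("hex")}-${name}`,
  );

  try {
    const res = await fetch(url);
    if (!res.ok) {
      throw new Error(`Invalid response, status: ${res.status}`);
    }
    await fsp.writeFile(tmp, Buffer.from(await res.arrayBuffer()));
    await fsp.rename(tmp, target);
  } catch (e) {
    if (!exists) {
      // nothing cached to fall back on
      throw e;
    }
    // otherwise keep the previously downloaded copy
  }
  return target;
}
```

In these terms, the seed file corresponds to `refetch = false` (never refetched once present), while behaviors and the profile always attempt a refresh and only reuse the cached copy when the download fails.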
This commit is contained in:
parent 1d15a155f2
commit 30646ca7ba
11 changed files with 436 additions and 134 deletions
@@ -156,6 +156,8 @@ export class Crawler {
  warcCdxDir: string;
  indexesDir: string;

  downloadsDir: string;

  screenshotWriter: WARCWriter | null;
  textWriter: WARCWriter | null;

@@ -289,6 +291,9 @@ export class Crawler {
    this.warcCdxDir = path.join(this.collDir, "warc-cdx");
    this.indexesDir = path.join(this.collDir, "indexes");

    // download dirs
    this.downloadsDir = path.join(this.collDir, "downloads");

    this.screenshotWriter = null;
    this.textWriter = null;

@@ -307,7 +312,7 @@ export class Crawler {

    this.customBehaviors = "";

    this.browser = new Browser();
    this.browser = new Browser(this.collDir);
  }

  protected parseArgs() {

@@ -503,6 +508,8 @@ export class Crawler {
      await fsp.mkdir(this.warcCdxDir, { recursive: true });
    }

    await fsp.mkdir(this.downloadsDir, { recursive: true });

    this.logFH = fs.createWriteStream(this.logFilename, { flags: "a" });
    logger.setExternalLogStream(this.logFH);

@@ -514,7 +521,7 @@ export class Crawler {
    this.proxyServer = res.proxyServer;
    this.proxyPacUrl = res.proxyPacUrl;

    this.seeds = await parseSeeds(this.params);
    this.seeds = await parseSeeds(this.downloadsDir, this.params);
    this.numOriginalSeeds = this.seeds.length;

    logger.info("Seeds", this.seeds);

@@ -1015,7 +1022,10 @@ self.__bx_behaviors.selectMainBehavior();
  async loadCustomBehaviors(sources: string[]) {
    let str = "";

    for (const { contents } of await collectCustomBehaviors(sources)) {
    for (const { contents } of await collectCustomBehaviors(
      this.downloadsDir,
      sources,
    )) {
      str += `self.__bx_behaviors.load(${contents});\n`;
    }

@@ -1029,7 +1039,10 @@ self.__bx_behaviors.selectMainBehavior();
      return;
    }

    for (const { path, contents } of await collectCustomBehaviors(sources)) {
    for (const { path, contents } of await collectCustomBehaviors(
      this.downloadsDir,
      sources,
    )) {
      await this.browser.checkScript(cdp, path, contents);
    }
  }

@@ -1,6 +1,7 @@
#!/usr/bin/env node

import fs from "fs";
import os from "os";
import http, { IncomingMessage, ServerResponse } from "http";

import readline from "readline";

@@ -203,7 +204,7 @@ async function main() {
    ]);
  }

  const browser = new Browser();
  const browser = new Browser(os.tmpdir());

  await browser.launch({
    profileUrl: params.profile,

@@ -1,9 +1,10 @@
import * as child_process from "child_process";
import fs from "fs";
import fsp from "node:fs/promises";
import { pipeline } from "node:stream/promises";
import { Readable } from "node:stream";
import crypto from "crypto";

import os from "os";
import path from "path";

import { formatErr, LogContext, logger } from "./logger.js";

@@ -31,6 +32,7 @@ import puppeteer, {
import { Recorder } from "./recorder.js";
import { timedRun } from "./timing.js";
import assert from "node:assert";
import { replaceDir } from "./file_reader.js";

type BtrixChromeOpts = {
  proxyServer?: string;

@@ -61,6 +63,7 @@ const BROWSER_HEIGHT_OFFSET = 81;

// ==================================================================
export class Browser {
  downloadsDir: string;
  profileDir: string;
  customProfile = false;
  // TODO: Fix this the next time the file is edited.

@@ -81,12 +84,9 @@ export class Browser {
  screenHeight: number;
  screenWHRatio: number;

  constructor() {
    this.profileDir = path.join(os.tmpdir(), "btrixProfile");
    if (fs.existsSync(this.profileDir)) {
      fs.rmSync(this.profileDir, { recursive: true, force: true });
    }
    fs.mkdirSync(this.profileDir);
  constructor(rootDir: string) {
    this.downloadsDir = path.join(rootDir, "downloads");
    this.profileDir = path.join(rootDir, "profile");

    // must be provided, part of Dockerfile
    assert(process.env.GEOMETRY);

@@ -112,9 +112,7 @@ export class Browser {
      return;
    }

    if (profileUrl) {
      this.customProfile = await this.loadProfile(profileUrl);
    }
    await this.installProfile(profileUrl);

    this.swOpt = swOpt;

@@ -190,61 +188,97 @@ export class Browser {
    }
  }

  async loadProfile(profileFilename: string): Promise<boolean> {
    const targetFilename = path.join(os.tmpdir(), "profile.tar.gz");
  async installProfile(profileUrl: string) {
    await fsp.mkdir(this.profileDir, { recursive: true });

    if (!profileUrl) {
      return;
    }

    const profileTarGz = path.join(this.downloadsDir, "profile.tar.gz");

    const exists = fs.existsSync(profileTarGz);

    const suffix = crypto.randomBytes(4).toString("hex");

    const tmpProfileDest = path.join(
      this.downloadsDir,
      `profile-${suffix}.tar.gz`,
    );
    const tmpProfileDir = path.join(this.downloadsDir, `profile-${suffix}`);

    await fsp.mkdir(tmpProfileDir, { recursive: true });

    try {
      await this.loadProfile(profileUrl, tmpProfileDest, tmpProfileDir);

      // replace old profile dir with new profile dir
      await replaceDir(tmpProfileDir, this.profileDir, exists);

      // replace old tarball with new tarball
      await fsp.rename(tmpProfileDest, profileTarGz);
    } catch (e) {
      if (exists) {
        logger.warn(
          "Error updating profile, using existing profile",
          formatErr(e),
          "browser",
        );
      } else {
        // remove the temp profile dir, likely empty
        await fsp.rm(tmpProfileDir, { recursive: true });
        logger.fatal("Profile setup failed", formatErr(e), "browser");
      }
    }
    this.customProfile = true;
  }

  async loadProfile(
    profileRemoteSrc: string,
    profileLocalSrc: string,
    profileDir: string,
  ) {
    if (
      profileFilename &&
      (profileFilename.startsWith("http:") ||
        profileFilename.startsWith("https:"))
      profileRemoteSrc &&
      (profileRemoteSrc.startsWith("http:") ||
        profileRemoteSrc.startsWith("https:"))
    ) {
      logger.info(
        `Downloading ${profileFilename} to ${targetFilename}`,
        `Downloading ${profileRemoteSrc} to ${profileLocalSrc}`,
        {},
        "browser",
      );

      const resp = await fetch(profileFilename);
      const resp = await fetch(profileRemoteSrc);
      await pipeline(
        // TODO: Fix this the next time the file is edited.
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        Readable.fromWeb(resp.body as any),
        fs.createWriteStream(targetFilename),
        fs.createWriteStream(profileLocalSrc),
      );

      profileFilename = targetFilename;
    } else if (profileFilename && profileFilename.startsWith("@")) {
    } else if (profileRemoteSrc && profileRemoteSrc.startsWith("@")) {
      const storage = initStorage();

      if (!storage) {
        logger.fatal(
        throw new Error(
          "Profile specified relative to s3 storage, but no S3 storage defined",
        );
        return false;
      }

      await storage.downloadFile(profileFilename.slice(1), targetFilename);

      profileFilename = targetFilename;
      await storage.downloadFile(profileRemoteSrc.slice(1), profileLocalSrc);
    } else {
      await fsp.copyFile(profileRemoteSrc, profileLocalSrc);
    }

    if (profileFilename) {
      try {
        child_process.execSync("tar xvfz " + profileFilename, {
          cwd: this.profileDir,
        });
        this.removeSingletons();
        return true;
      } catch (e) {
        logger.fatal(
          `Profile filename ${profileFilename} not a valid tar.gz, can not load profile, exiting`,
          {},
          "browser",
        );
      }
    try {
      child_process.execSync("tar xvfz " + profileLocalSrc, {
        cwd: profileDir,
        stdio: "ignore",
      });
      this.removeSingletons();
    } catch (e) {
      throw new Error(`Profile ${profileLocalSrc} not a valid tar.gz`);
    }

    return false;
  }

  removeSingletons() {

@@ -1,6 +1,5 @@
import fsp from "fs/promises";
import path from "path";
import os from "os";
import crypto from "crypto";
import { fetch } from "undici";
import util from "util";

@@ -25,39 +24,94 @@ export type FileSource = {
export type FileSources = FileSource[];

async function getTempFile(
  targetDir: string,
  filename: string,
  dirPrefix: string,
): Promise<string> {
  const tmpDir = path.join(
    os.tmpdir(),
    `${dirPrefix}-${crypto.randomBytes(4).toString("hex")}`,
  return path.join(
    targetDir,
    `${crypto.randomBytes(4).toString("hex")}-${filename}`,
  );
  await fsp.mkdir(tmpDir, { recursive: true });
  return path.join(tmpDir, filename);
}

export async function replaceDir(
  sourceDir: string,
  destDir: string,
  exists: boolean,
) {
  // Move new dir to new location
  try {
    if (exists) {
      await fsp.rm(destDir, { force: true, recursive: true });
    }
    //await exec(`mv ${sourceDir} ${destDir}`);
    await fsp.rename(sourceDir, destDir);
  } catch (e) {
    logger.fatal("Error moving/renaming directories, should not happen", {
      ...formatErr(e),
    });
  }
}

async function writeUrlContentsToFile(
  targetDir: string,
  url: string,
  pathPrefix: string,
  pathDefaultExt: string,
  useProxy: boolean = false,
) {
  const res = await fetch(url, {
    dispatcher: useProxy ? getProxyDispatcher(url) : undefined,
  });
  const fileContents = await res.text();

  fetchNew = false,
  useProxy = false,
): Promise<string> {
  const filename =
    path.basename(new URL(url).pathname) || "index." + pathDefaultExt;
  const filepath = await getTempFile(filename, pathPrefix);

  await fsp.writeFile(filepath, fileContents);
  return filepath;
  const targetFile = path.join(targetDir, pathPrefix + filename);
  let exists = false;

  try {
    await fsp.access(targetFile, fsp.constants.R_OK | fsp.constants.W_OK);
    exists = true;
  } catch (e) {
    // ignore
  }

  if (exists && !fetchNew) {
    return targetFile;
  }

  try {
    const res = await fetch(url, {
      dispatcher: useProxy ? getProxyDispatcher(url) : undefined,
    });
    if (!res.ok) {
      throw new Error(`Invalid response, status: ${res.status}`);
    }
    const fileContents = await res.text();

    const filepath = await getTempFile(targetDir, filename);

    await fsp.writeFile(filepath, fileContents);

    await fsp.rename(filepath, targetFile);
  } catch (e) {
    if (!exists) {
      throw e;
    }
  }
  return targetFile;
}

export async function collectOnlineSeedFile(url: string): Promise<string> {
export async function collectOnlineSeedFile(
  targetDir: string,
  url: string,
): Promise<string> {
  try {
    const filepath = await writeUrlContentsToFile(url, "seeds-", ".txt");
    const filepath = await writeUrlContentsToFile(
      targetDir,
      url,
      "seeds-",
      ".txt",
      false,
      false,
    );
    logger.info("Seed file downloaded", { url, path: filepath });
    return filepath;
  } catch (e) {

@@ -70,16 +124,17 @@ export async function collectOnlineSeedFile(url: string): Promise<string> {
}

export async function collectCustomBehaviors(
  targetDir: string,
  sources: string[],
): Promise<FileSources> {
  const collectedSources: FileSources = [];

  for (const fileSource of sources) {
    if (fileSource.startsWith("git+")) {
      const newSources = await collectGitBehaviors(fileSource);
      const newSources = await collectGitBehaviors(targetDir, fileSource);
      collectedSources.push(...newSources);
    } else if (fileSource.startsWith("http")) {
      const newSources = await collectOnlineBehavior(fileSource);
      const newSources = await collectOnlineBehavior(targetDir, fileSource);
      collectedSources.push(...newSources);
    } else {
      const newSources = await collectLocalPathBehaviors(fileSource);

@@ -90,16 +145,32 @@ export async function collectCustomBehaviors(
  return collectedSources;
}

async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
async function collectGitBehaviors(
  targetDir: string,
  gitUrl: string,
): Promise<FileSources> {
  const url = gitUrl.split("git+").pop() || "";
  const params = new URL(url).searchParams;
  const branch = params.get("branch") || "";
  const relPath = params.get("path") || "";
  const urlStripped = url.split("?")[0];

  const urlHash = crypto.createHash("sha-256").update(url).digest("hex");

  const behaviorsDir = path.join(targetDir, `behaviors-repo-${urlHash}`);

  let exists = false;

  try {
    await fsp.access(behaviorsDir);
    exists = true;
  } catch (e) {
    // ignore
  }

  const tmpDir = path.join(
    os.tmpdir(),
    `behaviors-repo-${crypto.randomBytes(4).toString("hex")}`,
    targetDir,
    `behaviors-repo-${urlHash}-${crypto.randomBytes(4).toString("hex")}`,
  );

  let cloneCommand = "git clone ";

@@ -113,6 +184,7 @@ async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
    pathToCollect = path.join(tmpDir, relPath);
  }

  // Download behaviors to temp dir (in downloads directory)
  try {
    await exec(cloneCommand);
    logger.info(

@@ -120,23 +192,45 @@ async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
      { url: urlStripped },
      "behavior",
    );
    return await collectLocalPathBehaviors(pathToCollect);
  } catch (e) {
    logger.fatal(
      "Error downloading custom behaviors from Git repo",
      { url: urlStripped, ...formatErr(e) },
      "behavior",
    );
    if (!exists) {
      logger.fatal(
        "Error downloading custom behaviors from Git repo",
        { url: urlStripped, ...formatErr(e) },
        "behavior",
      );
    } else {
      logger.info(
        "Error re-downloading custom behaviors from Git repo, using existing behaviors",
        { url: urlStripped, ...formatErr(e) },
        "behavior",
      );
      return await collectLocalPathBehaviors(behaviorsDir);
    }
  }
  return [];

  await replaceDir(pathToCollect, behaviorsDir, exists);

  // remove the rest of the repo that we're not using
  if (relPath) {
    await fsp.rm(tmpDir, { recursive: true, force: true });
  }

  return await collectLocalPathBehaviors(behaviorsDir);
}

async function collectOnlineBehavior(url: string): Promise<FileSources> {
async function collectOnlineBehavior(
  targetDir: string,
  url: string,
): Promise<FileSources> {
  try {
    const behaviorFilepath = await writeUrlContentsToFile(
      targetDir,
      url,
      "behaviors-",
      ".js",
      true,
      false,
    );
    logger.info(
      "Custom behavior file downloaded",

@@ -304,7 +304,10 @@ export class ScopedSeed {
  }
}

export async function parseSeeds(params: CrawlerArgs): Promise<ScopedSeed[]> {
export async function parseSeeds(
  targetDir: string,
  params: CrawlerArgs,
): Promise<ScopedSeed[]> {
  let seeds = params.seeds as string[];
  const scopedSeeds: ScopedSeed[] = [];

@@ -314,7 +317,7 @@ export async function parseSeeds(params: CrawlerArgs): Promise<ScopedSeed[]> {
      seedFilePath.startsWith("http://") ||
      seedFilePath.startsWith("https://")
    ) {
      seedFilePath = await collectOnlineSeedFile(seedFilePath);
      seedFilePath = await collectOnlineSeedFile(targetDir, seedFilePath);
    }

    const urlSeedFile = fs.readFileSync(seedFilePath, "utf8");

@@ -16,7 +16,7 @@ import getFolderSize from "get-folder-size";

import { WACZ } from "./wacz.js";
import { sleep, timedRun } from "./timing.js";
import { DEFAULT_MAX_RETRIES, ExitCodes } from "./constants.js";
import { DEFAULT_MAX_RETRIES } from "./constants.js";

const DEFAULT_REGION = "us-east-1";

@@ -176,12 +176,7 @@ export class S3StorageSync {
        await sleep(5);
        logger.warn("Retry downloading profile", {}, "storage");
      } else {
        logger.fatal(
          "Could not download profile, exiting",
          {},
          "storage",
          ExitCodes.Failed,
        );
        throw new Error("Profile could not be downloaded");
      }
    }
  }

@@ -7,9 +7,11 @@ test("ensure dryRun crawl only writes pages and logs", async () => {
  );

  const files = fs.readdirSync("test-crawls/collections/dry-run-wr-net").sort();
  expect(files.length).toBe(2);
  expect(files[0]).toBe("logs");
  expect(files[1]).toBe("pages");
  expect(files.length).toBe(4);
  expect(files[0]).toBe("downloads");
  expect(files[1]).toBe("logs");
  expect(files[2]).toBe("pages");
  expect(files[3]).toBe("profile");
});

BIN tests/fixtures/sample-profile.tar.gz (vendored, new file): binary file not shown.

tests/profiles.test.js (new file, 87 lines):
@@ -0,0 +1,87 @@
import { execSync } from "child_process";
import fs from "node:fs";


test("run with invalid profile, fail", async () => {
  let status = 0;
  try {
    await execSync(
      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection profile-0 --url https://example-com.webrecorder.net/ --url https://old.webrecorder.net/ --url https://old.webrecorder.net/about --limit 1 --profile /tests/fixtures/invalid.tar.gz",
    );
  } catch (error) {
    status = error.status;
  }

  expect(status).toBe(17);
});

test("start with no profile", async () => {
  let status = 0;
  try {
    await execSync(
      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection profile-1 --url https://example-com.webrecorder.net/ --url https://old.webrecorder.net/ --url https://old.webrecorder.net/about --limit 1",
    );
  } catch (error) {
    status = error.status;
  }

  expect(status).toBe(0);
});

test("resume same crawl, but with invalid profile, not valid as no previous valid profile", async () => {
  let status = 0;
  try {
    await execSync(
      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection profile-1 --url https://example-com.webrecorder.net/ --url https://old.webrecorder.net/ --url https://old.webrecorder.net/about --limit 1 --profile /tests/fixtures/invalid.tar.gz",
    );
  } catch (error) {
    status = error.status;
  }

  expect(status).toBe(17);
});


test("start with valid profile", async () => {
  let status = 0;
  try {
    await execSync(
      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection profile-2 --url https://example-com.webrecorder.net/ --url https://old.webrecorder.net/ --url https://old.webrecorder.net/about --limit 1 --scopeType page --profile /tests/fixtures/sample-profile.tar.gz",
    );
  } catch (error) {
    status = error.status;
  }

  expect(status).toBe(0);

  let crawled_pages = fs.readFileSync(
    "test-crawls/collections/profile-2/pages/pages.jsonl",
    "utf8",
  );

  // crawled only one page (+ header)
  expect(crawled_pages.split("\n").length === 2);
});


test("resume same crawl, ignore invalid profile, use existing, finish crawl", async () => {
  let status = 0;
  try {
    await execSync(
      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection profile-2 --url https://example-com.webrecorder.net/ --url https://old.webrecorder.net/ --url https://old.webrecorder.net/about --scopeType page --profile /tests/fixtures/invalid.tar.gz",
    );
  } catch (error) {
    status = error.status;
  }

  expect(status).toBe(0);

  let crawled_pages = fs.readFileSync(
    "test-crawls/collections/profile-1/pages/pages.jsonl",
    "utf8",
  );

  // crawled 3 pages
  expect(crawled_pages.split("\n").length === 4);
});

@@ -14,7 +14,7 @@ async function getSeeds(config) {
  };

  const params = parseArgs(["node", "crawler", "--config", "stdinconfig"]);
  return await parseSeeds(params);
  return await parseSeeds("", params);
}

test("default scope", async () => {

@@ -1,6 +1,8 @@
import util from "util";
import { spawn, exec as execCallback } from "child_process";
import fs from "fs";
import os from "os";
import path from "path";

const exec = util.promisify(execCallback);

@@ -9,17 +11,54 @@ let proc = null;
const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
const TEST_HOST = `http://${DOCKER_HOST_NAME}:31502`;

const fixtures = path.join("tests", "fixtures");
const seedFileCopy = path.join(fixtures, "seedFileCopy.txt");

beforeAll(() => {
  proc = spawn("../../node_modules/.bin/http-server", ["-p", "31502"], {cwd: "tests/fixtures/"});
  fs.copyFileSync(path.join(fixtures, "urlSeedFile.txt"), seedFileCopy);

  proc = spawn("../../node_modules/.bin/http-server", ["-p", "31502"], {cwd: fixtures});
});

afterAll(() => {
  if (proc) {
    proc.kill();
    proc = null;
  }
  fs.unlinkSync(seedFileCopy);
});


function verifyAllSeedsCrawled(collName, hasDownload) {
  let crawled_pages = fs.readFileSync(
    `test-crawls/collections/${collName}/pages/pages.jsonl`,
    "utf8",
  );

  const seedFile = hasDownload ? `test-crawls/collections/${collName}/downloads/seeds-seedFileCopy.txt` : "tests/fixtures/urlSeedFile.txt";
  let seed_file = fs
    .readFileSync(seedFile, "utf8")
    .split("\n")
    .sort();

  let seed_file_list = [];
  for (var j = 0; j < seed_file.length; j++) {
    if (seed_file[j] != undefined) {
      seed_file_list.push(seed_file[j]);
    }
  }

  let foundSeedUrl = true;

  for (var i = 1; i < seed_file_list.length; i++) {
    if (crawled_pages.indexOf(seed_file_list[i]) == -1) {
      foundSeedUrl = false;
    }
  }
  expect(foundSeedUrl).toBe(true);
}


test("check that URLs in seed-list are crawled", async () => {
  try {

@@ -30,64 +69,98 @@ test("check that URLs in seed-list are crawled", async () => {
    console.log(error);
  }

  let crawled_pages = fs.readFileSync(
    "test-crawls/collections/filelisttest/pages/pages.jsonl",
    "utf8",
  );
  let seed_file = fs
    .readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")
    .split("\n")
    .sort();

  let seed_file_list = [];
  for (var j = 0; j < seed_file.length; j++) {
    if (seed_file[j] != undefined) {
      seed_file_list.push(seed_file[j]);
    }
  }

  let foundSeedUrl = true;

  for (var i = 1; i < seed_file_list.length; i++) {
    if (crawled_pages.indexOf(seed_file_list[i]) == -1) {
      foundSeedUrl = false;
    }
  }
  expect(foundSeedUrl).toBe(true);
  verifyAllSeedsCrawled("filelisttest", false);
});


test("check that URLs in seed-list hosted at URL are crawled", async () => {
  try {
    await exec(
      `docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "${TEST_HOST}/urlSeedFile.txt" --timeout 90000 --scopeType page`,
      `docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "${TEST_HOST}/seedFileCopy.txt" --timeout 90000 --scopeType page`,
    );
  } catch (error) {
    console.log(error);
  }

  verifyAllSeedsCrawled("onlinefilelisttest", true);

});


test("start crawl, interrupt, remove seed file, and ensure all seed URLs are crawled", async () => {
  try {
    await exec(
      `docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection seed-file-removed --urlFile "${TEST_HOST}/seedFileCopy.txt" --timeout 90000 --scopeType page --limit 1`,
    );
  } catch (error) {
    console.log(error);
  }

  let crawled_pages = fs.readFileSync(
    "test-crawls/collections/onlinefilelisttest/pages/pages.jsonl",
    "test-crawls/collections/seed-file-removed/pages/pages.jsonl",
    "utf8",
  );
  let seed_file = fs
    .readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")
    .split("\n")
    .sort();

  let seed_file_list = [];
  for (var j = 0; j < seed_file.length; j++) {
    if (seed_file[j] != undefined) {
      seed_file_list.push(seed_file[j]);
  expect(crawled_pages.split("\n").length === 2);

  try {
    // move file so server returns 404
    fs.renameSync(seedFileCopy, seedFileCopy + ".bak");

    // server no longer up
    try {
      const res = await fetch("http://localhost:31502/");
      expect(res.status).toBe(404);
    } catch (e) {
      // ignore
    }

    // restart crawl, but with invalid seed list now
    await exec(
      `docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection seed-file-removed --urlFile "${TEST_HOST}/seedFileCopy.txt" --timeout 90000 --scopeType page`,
    );
  } catch (error) {
    console.log(error);
  } finally {
    // move back
    fs.renameSync(seedFileCopy + ".bak", seedFileCopy);
  }

  let foundSeedUrl = true;

  for (var i = 1; i < seed_file_list.length; i++) {
    if (crawled_pages.indexOf(seed_file_list[i]) == -1) {
      foundSeedUrl = false;
    }
  }
  expect(foundSeedUrl).toBe(true);
  verifyAllSeedsCrawled("seed-file-removed", true);
});


test("start crawl, interrupt, stop seed file server, and ensure all seed URLs are crawled", async () => {
  try {
    await exec(
      `docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection seed-file-server-gone --urlFile "${TEST_HOST}/seedFileCopy.txt" --timeout 90000 --scopeType page --limit 1`,
    );
  } catch (error) {
    console.log(error);
  }

  let crawled_pages = fs.readFileSync(
    "test-crawls/collections/seed-file-server-gone/pages/pages.jsonl",
    "utf8",
  );

  expect(crawled_pages.split("\n").length === 2);

  // kill server that serves the seed list
  proc.kill();

  // server no longer up
  await expect(() => fetch("http://localhost:31502/")).rejects.toThrow("fetch failed");

  // restart crawl, but with invalid seed list now
  try {
    await exec(
      `docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection seed-file-server-gone --urlFile "${TEST_HOST}/seedFileCopy.txt" --timeout 90000 --scopeType page`,
    );
  } catch (error) {
    console.log(error);
  }

  verifyAllSeedsCrawled("seed-file-server-gone", true);
});