Mirror of https://github.com/webrecorder/browsertrix-crawler.git
Synced 2025-10-19 14:33:17 +00:00

Fixes from 1.0.3 release -> main (#517)
sitemap improvements: gz support + application/xml + extraHops fix (#511), follow-up to https://github.com/webrecorder/browsertrix-crawler/issues/496:
- support parsing sitemap URLs that end in .gz with gzip decompression
- support both `application/xml` and `text/xml` as valid sitemap content-types (add a test for both)
- ignore extraHops for sitemap-found URLs by setting them past the extraHops limit (otherwise, all sitemap URLs would be treated as links from the seed page)

fixes redirected seed (from #476) being counted against the page limit (#509):
- subtract extraSeeds when computing the limit
- don't include redirect seeds in the seen list when serializing
- tests: adjust saved-state test to also check total pages when the crawl is done

fixes #508
This commit is contained in:
parent 6b6cb4137a
commit 01c4139aa7
5 changed files with 97 additions and 38 deletions
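
The .gz support described above comes down to routing a fetched response body through the Web Streams `DecompressionStream` before wrapping it in a Node `Readable` for the XML parser. A minimal standalone sketch of that pattern, not the crawler's actual helper (assumes Node 18+ globals and a placeholder URL):

```ts
import { Readable } from "node:stream";
import type { ReadableStream } from "node:stream/web";

// Sketch only: fetch a sitemap URL and, if it ends in .gz, gunzip the web
// stream before handing it to a Node Readable.
async function openSitemapStream(url: string): Promise<Readable> {
  const resp = await fetch(url);
  const { body } = resp;
  if (!body) {
    throw new Error("missing response body");
  }
  // .gz sitemaps are transparently decompressed; others pass through unchanged
  const stream = url.endsWith(".gz")
    ? body.pipeThrough(new DecompressionStream("gzip"))
    : body;
  return Readable.fromWeb(stream as ReadableStream<Uint8Array>);
}

// usage (placeholder URL): pipe the resulting Node stream into a sax/XML parser
// openSitemapStream("https://example.com/sitemap.xml.gz").then((s) => s.pipe(parser));
```
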
@@ -2152,6 +2152,10 @@ self.__bx_behaviors.selectMainBehavior();
    let finished = false;

+   // disable extraHops for sitemap found URLs by setting to extraHops limit + 1
+   // otherwise, all sitemap found URLs would be eligible for additional hops
+   const extraHopsDisabled = this.params.extraHops + 1;
+
    await new Promise<void>((resolve) => {
      sitemapper.on("end", () => {
        resolve();
@@ -2165,6 +2169,7 @@ self.__bx_behaviors.selectMainBehavior();
          finished = true;
        }
      });

      sitemapper.on("url", ({ url }) => {
        const count = sitemapper.count;
        if (count % 10 ** power === 0) {
@@ -2178,7 +2183,7 @@ self.__bx_behaviors.selectMainBehavior();
            "sitemap",
          );
        }
-       this.queueInScopeUrls(seedId, [url], 0);
+       this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled);
        if (count >= 100 && !resolved) {
          logger.info(
            "Sitemap partially parsed, continue parsing large sitemap in the background",

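The intent of `extraHopsDisabled` above, per the comment in the change, is that sitemap-found URLs start with an extraHops counter already past the configured limit and so never qualify for additional out-of-scope hops, while URLs discovered by normal link extraction are unaffected. A toy illustration of the comparison, with a hypothetical helper rather than the crawler's scope-check API:

```ts
// Toy illustration only; the helper name and signature are hypothetical and do
// not exist in the crawler. It just shows why "limit + 1" acts as "disabled".
function eligibleForExtraHops(urlExtraHops: number, extraHopsLimit: number): boolean {
  return urlExtraHops <= extraHopsLimit;
}

const extraHopsLimit = 1;
const extraHopsDisabled = extraHopsLimit + 1;

console.log(eligibleForExtraHops(1, extraHopsLimit));                 // true: a link found on a crawled page
console.log(eligibleForExtraHops(extraHopsDisabled, extraHopsLimit)); // false: a sitemap-found URL
```
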
@ -11,6 +11,9 @@ import { sleep } from "./timing.js";
|
|||
|
||||
const SITEMAP_CONCURRENCY = 5;
|
||||
|
||||
const TEXT_CONTENT_TYPE = ["text/plain"];
|
||||
const XML_CONTENT_TYPES = ["text/xml", "application/xml"];
|
||||
|
||||
export type SitemapOpts = {
|
||||
headers?: Record<string, string>;
|
||||
|
||||
|
@ -83,7 +86,7 @@ export class SitemapReader extends EventEmitter {
|
|||
}
|
||||
}
|
||||
|
||||
async tryFetch(url: string, expectedCT?: string | null) {
|
||||
async tryFetch(url: string, expectedCT?: string[] | null) {
|
||||
try {
|
||||
logger.debug(
|
||||
"Detecting Sitemap: fetching",
|
||||
|
@ -101,7 +104,7 @@ export class SitemapReader extends EventEmitter {
|
|||
}
|
||||
|
||||
const ct = resp.headers.get("content-type");
|
||||
if (expectedCT && ct && ct.split(";")[0] != expectedCT) {
|
||||
if (expectedCT && ct && !expectedCT.includes(ct.split(";")[0])) {
|
||||
logger.debug(
|
||||
"Detecting Sitemap: invalid content-type",
|
||||
{ ct },
|
||||
|
@ -129,12 +132,12 @@ export class SitemapReader extends EventEmitter {
|
|||
if (sitemap === DETECT_SITEMAP) {
|
||||
logger.debug("Detecting sitemap for seed", { seedUrl }, "sitemap");
|
||||
fullUrl = new URL("/robots.txt", seedUrl).href;
|
||||
resp = await this.tryFetch(fullUrl, "text/plain");
|
||||
resp = await this.tryFetch(fullUrl, TEXT_CONTENT_TYPE);
|
||||
if (resp) {
|
||||
isRobots = true;
|
||||
} else {
|
||||
fullUrl = new URL("/sitemap.xml", seedUrl).href;
|
||||
resp = await this.tryFetch(fullUrl, "text/xml");
|
||||
resp = await this.tryFetch(fullUrl, XML_CONTENT_TYPES);
|
||||
if (resp) {
|
||||
isSitemap = true;
|
||||
}
|
||||
|
@ -144,10 +147,10 @@ export class SitemapReader extends EventEmitter {
|
|||
fullUrl = new URL(sitemap, seedUrl).href;
|
||||
let expected = null;
|
||||
if (fullUrl.endsWith(".xml")) {
|
||||
expected = "text/xml";
|
||||
expected = XML_CONTENT_TYPES;
|
||||
isSitemap = true;
|
||||
} else if (fullUrl.endsWith(".txt")) {
|
||||
expected = "text/plain";
|
||||
expected = TEXT_CONTENT_TYPE;
|
||||
isRobots = true;
|
||||
}
|
||||
resp = await this.tryFetch(fullUrl, expected);
|
||||
|
@ -218,8 +221,22 @@ export class SitemapReader extends EventEmitter {
|
|||
}
|
||||
|
||||
private async _parseSitemapFromResponse(url: string, resp: Response) {
|
||||
let stream;
|
||||
|
||||
const { body } = resp;
|
||||
if (!body) {
|
||||
throw new Error("missing response body");
|
||||
}
|
||||
// decompress .gz sitemaps
|
||||
if (url.endsWith(".gz")) {
|
||||
const ds = new DecompressionStream("gzip");
|
||||
stream = body.pipeThrough(ds);
|
||||
} else {
|
||||
stream = body;
|
||||
}
|
||||
|
||||
const readableNodeStream = Readable.fromWeb(
|
||||
resp.body as ReadableStream<Uint8Array>,
|
||||
stream as ReadableStream<Uint8Array>,
|
||||
);
|
||||
this.initSaxParser(url, readableNodeStream);
|
||||
}
|
||||
|
@ -244,6 +261,8 @@ export class SitemapReader extends EventEmitter {
|
|||
let currUrl: string | null;
|
||||
let lastmod: Date | null = null;
|
||||
|
||||
let errCount = 0;
|
||||
|
||||
let otherTags = 0;
|
||||
|
||||
parserStream.on("end", async () => {
|
||||
|
@ -358,10 +377,16 @@ export class SitemapReader extends EventEmitter {
|
|||
this.pending.delete(url);
|
||||
return;
|
||||
}
|
||||
logger.warn("Sitemap error parsing XML", { err }, "sitemap");
|
||||
logger.warn(
|
||||
"Sitemap error parsing XML",
|
||||
{ url, err, errCount },
|
||||
"sitemap",
|
||||
);
|
||||
if (errCount++ < 3) {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(parserStream._parser as any).error = null;
|
||||
parserStream._parser.resume();
|
||||
}
|
||||
});
|
||||
|
||||
sourceStream.pipe(parserStream);
|
||||
|
|
|
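The `tryFetch` change above swaps a single expected content-type string for an allow-list, which is what lets both `text/xml` and `application/xml` sitemaps pass; the media type is compared with any `; charset=...` parameters stripped. A small self-contained sketch of that comparison (the helper name is made up for illustration):

```ts
const XML_CONTENT_TYPES = ["text/xml", "application/xml"];

// Illustrative helper (not part of the crawler): accept a response whose
// media type, minus parameters, appears in the expected list.
function contentTypeMatches(header: string | null, expected: string[]): boolean {
  if (!header) {
    return true; // no header to check; the tryFetch guard also skips the check then
  }
  return expected.includes(header.split(";")[0].trim());
}

console.log(contentTypeMatches("application/xml; charset=utf-8", XML_CONTENT_TYPES)); // true
console.log(contentTypeMatches("text/html", XML_CONTENT_TYPES));                      // false
```
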
@ -97,6 +97,7 @@ declare module "ioredis" {
|
|||
pkey: string,
|
||||
qkey: string,
|
||||
skey: string,
|
||||
esKey: string,
|
||||
url: string,
|
||||
score: number,
|
||||
data: string,
|
||||
|
@ -199,9 +200,9 @@ export class RedisCrawlState {
|
|||
|
||||
_initLuaCommands(redis: Redis) {
|
||||
redis.defineCommand("addqueue", {
|
||||
numberOfKeys: 3,
|
||||
numberOfKeys: 4,
|
||||
lua: `
|
||||
local size = redis.call('scard', KEYS[3]);
|
||||
local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]);
|
||||
local limit = tonumber(ARGV[4]);
|
||||
if limit > 0 and size >= limit then
|
||||
return 1;
|
||||
|
@ -506,6 +507,7 @@ return 0;
|
|||
this.pkey,
|
||||
this.qkey,
|
||||
this.skey,
|
||||
this.esKey,
|
||||
url,
|
||||
this._getScore(data),
|
||||
JSON.stringify(data),
|
||||
|
@ -624,7 +626,8 @@ return 0;
|
|||
|
||||
for (const result of someResults) {
|
||||
const json = JSON.parse(result);
|
||||
seenSet.delete(json.url);
|
||||
//for extra seeds
|
||||
seenSet.delete(json.url || json.newUrl);
|
||||
results.push(result);
|
||||
}
|
||||
}
|
||||
|
@ -722,10 +725,6 @@ return 0;
|
|||
return parseInt(done || "0");
|
||||
}
|
||||
|
||||
async numSeen() {
|
||||
return await this.redis.scard(this.skey);
|
||||
}
|
||||
|
||||
async numPending() {
|
||||
const res = await this.redis.hlen(this.pkey);
|
||||
|
||||
|
|
|
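The Lua change above alters the page-limit check so that only real pages count: the size compared against the limit is the cardinality of the seen set minus the length of the extra-seeds list, which is how a redirected seed stops being counted against `--limit`. A worked example of the same arithmetic, as a sketch rather than the script itself:

```ts
// Sketch of the limit test performed by the addqueue script: `seenCount` stands
// for SCARD of the seen set and `extraSeedCount` for LLEN of the extra-seeds list.
function atLimit(seenCount: number, extraSeedCount: number, limit: number): boolean {
  const size = seenCount - extraSeedCount;
  return limit > 0 && size >= limit;
}

// With --limit 10, ten URLs marked seen, one of them a redirected seed:
console.log(atLimit(10, 1, 10)); // false: one more page can still be queued
console.log(atLimit(11, 1, 10)); // true:  the limit is now reached
```
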
@ -4,17 +4,15 @@ import path from "path";
|
|||
import yaml from "js-yaml";
|
||||
import Redis from "ioredis";
|
||||
|
||||
|
||||
const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";
|
||||
|
||||
|
||||
function sleep(ms) {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
async function waitContainer(containerId) {
|
||||
try {
|
||||
execSync(`docker kill -s SIGINT ${containerId}`);
|
||||
} catch (e) {
|
||||
return;
|
||||
}
|
||||
|
||||
async function waitContainerDone(containerId) {
|
||||
// containerId is initially the full id, but docker ps
|
||||
// only prints the short id (first 12 characters)
|
||||
containerId = containerId.slice(0, 12);
|
||||
|
@ -32,6 +30,17 @@ async function waitContainer(containerId) {
|
|||
}
|
||||
}
|
||||
|
||||
async function killContainer(containerId) {
|
||||
try {
|
||||
execSync(`docker kill -s SIGINT ${containerId}`);
|
||||
} catch (e) {
|
||||
return;
|
||||
}
|
||||
|
||||
await waitContainerDone(containerId);
|
||||
}
|
||||
|
||||
|
||||
let savedStateFile;
|
||||
let state;
|
||||
let numDone;
|
||||
|
@ -43,7 +52,7 @@ test("check crawl interrupted + saved state written", async () => {
|
|||
|
||||
try {
|
||||
containerId = execSync(
|
||||
"docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10",
|
||||
"docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10 --behaviors \"\"",
|
||||
{ encoding: "utf-8" },
|
||||
//wait.callback,
|
||||
);
|
||||
|
@ -51,8 +60,6 @@ test("check crawl interrupted + saved state written", async () => {
|
|||
console.log(error);
|
||||
}
|
||||
|
||||
const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";
|
||||
|
||||
// remove existing pagesFile to support reentrancy
|
||||
try {
|
||||
fs.unlinkSync(pagesFile);
|
||||
|
@ -77,7 +84,7 @@ test("check crawl interrupted + saved state written", async () => {
|
|||
await sleep(500);
|
||||
}
|
||||
|
||||
await waitContainer(containerId);
|
||||
await killContainer(containerId);
|
||||
|
||||
const savedStates = fs.readdirSync(
|
||||
"test-crawls/collections/int-state-test/crawls",
|
||||
|
@ -97,11 +104,13 @@ test("check parsing saved state + page done + queue present", () => {
|
|||
|
||||
const saved = yaml.load(savedState);
|
||||
|
||||
expect(!!saved.state).toBe(true);
|
||||
state = saved.state;
|
||||
numDone = state.finished.length;
|
||||
finished = state.finished;
|
||||
|
||||
numDone = finished.length;
|
||||
numQueued = state.queued.length;
|
||||
|
||||
expect(!!state).toBe(true);
|
||||
expect(numDone > 0).toEqual(true);
|
||||
expect(numQueued > 0).toEqual(true);
|
||||
expect(numDone + numQueued).toEqual(10);
|
||||
|
@ -110,8 +119,6 @@ test("check parsing saved state + page done + queue present", () => {
|
|||
expect(state.extraSeeds).toEqual([
|
||||
`{"origSeedId":0,"newUrl":"https://webrecorder.net/"}`,
|
||||
]);
|
||||
|
||||
finished = state.finished;
|
||||
});
|
||||
|
||||
test("check crawl restarted with saved state", async () => {
|
||||
|
@ -121,7 +128,7 @@ test("check crawl restarted with saved state", async () => {
|
|||
|
||||
try {
|
||||
containerId = execSync(
|
||||
`docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`,
|
||||
`docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors ""`,
|
||||
{ encoding: "utf-8" },
|
||||
);
|
||||
} catch (error) {
|
||||
|
@ -148,6 +155,16 @@ test("check crawl restarted with saved state", async () => {
|
|||
} catch (e) {
|
||||
console.log(e);
|
||||
} finally {
|
||||
await waitContainer(containerId);
|
||||
await waitContainerDone(containerId);
|
||||
}
|
||||
});
|
||||
|
||||
test("ensure correct number of pages was written", () => {
|
||||
const pages = fs
|
||||
.readFileSync(pagesFile, { encoding: "utf-8" })
|
||||
.trim()
|
||||
.split("\n");
|
||||
|
||||
// first line is the header
|
||||
expect(pages.length).toBe(10 + 1);
|
||||
});
|
||||
|
|
|
@@ -29,8 +29,8 @@ async function waitContainer(containerId) {
   }
 }

-async function runCrawl(numExpected, url, sitemap="", limit=0) {
-  const containerId = child_process.execSync(`docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis`, {encoding: "utf-8"});
+async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") {
+  const containerId = child_process.execSync(`docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`, {encoding: "utf-8"});

   await sleep(3000);
@@ -57,6 +57,10 @@ async function runCrawl(numExpected, url, sitemap="", limit=0) {
   }

   expect(finished).toBeGreaterThanOrEqual(numExpected);

+  if (numExpectedLessThan) {
+    expect(finished).toBeLessThanOrEqual(numExpectedLessThan);
+  }
 }

 test("test sitemap fully finish", async () => {
@@ -70,3 +74,12 @@ test("test sitemap with limit", async () => {
 test("test sitemap with limit, specific URL", async () => {
   await runCrawl(1900, "https://www.mozilla.org/", "https://www.mozilla.org/sitemap.xml", 2000);
 });

+test("test sitemap with application/xml content-type", async () => {
+  await runCrawl(10, "https://bitarchivist.net/", "", 0);
+});
+
+test("test sitemap with narrow scope, extraHops, to ensure extraHops don't apply to sitemap", async () => {
+  await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
+});