Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-12-07 13:49:47 +00:00
Add option to respect robots.txt disallows (#888)
Fixes #631

- Adds a --robots flag which enables checking robots.txt for each host, for each page, before the page is queued for further crawling.
- Supports a --robotsAgent flag which configures the agent to check in robots.txt, in addition to '*'. Defaults to 'Browsertrix/1.x'.
- robots.txt bodies are parsed and checked for page allow/disallow status using the https://github.com/samclarke/robots-parser library, which is the most active and well-maintained implementation I could find with TypeScript types.
- Fetched robots.txt bodies are cached by their URL in Redis using an LRU cache, retaining the last 100 robots.txt entries, each up to 100K.
- Non-200 responses are treated as empty robots.txt, and an empty robots.txt is treated as 'allow all'.
- Multiple requests for the same robots.txt are batched to perform only one fetch, waiting up to 10 seconds per fetch.

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
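For context, here is a minimal, self-contained sketch of the allow/disallow check that the new src/util/robots.ts performs via robots-parser. The robots.txt body and URLs are illustrative only and are not part of this change.

```ts
// Sketch only: shows how robots-parser answers the allow/disallow question
// the crawler asks before queueing a page. Rules and URLs are made up.
import robotsParser from "robots-parser";

const robotsUrl = "https://example.com/robots.txt";

// A non-200 response or an empty body is treated by the crawler as "allow all";
// here we use a simple body with a single wildcard group.
const body = ["User-agent: *", "Disallow: /private/"].join("\n");

const robots = robotsParser(robotsUrl, body);

// The crawler passes the configured --robotsAgent (default "Browsertrix/1.x");
// with no agent-specific group in the file, the "*" rules apply.
console.log(robots.isDisallowed("https://example.com/private/page", "Browsertrix/1.x")); // true
console.log(robots.isDisallowed("https://example.com/index.html", "Browsertrix/1.x")); // false
```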
parent 75a0c9a305
commit 1d15a155f2

9 changed files with 247 additions and 5 deletions
@@ -103,16 +103,16 @@ Options:
   , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
   ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
   orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
-  atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"]
-                                                                 [default: []]
+  atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope",
+  "robots"] [default: []]
       --logExcludeContext                   Comma-separated list of contexts to
                                             NOT include in logs
   [array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
   , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
   ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
   orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
-  atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"]
-                        [default: ["recorderNetwork","jsError","screencast"]]
+  atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope",
+  "robots"] [default: ["recorderNetwork","jsError","screencast"]]
       --text                                Extract initial (default) or final t
                                             ext to pages.jsonl or WARC resource
                                             record(s)
@@ -324,6 +324,12 @@ Options:
                                             the Chrome instance (space-separated
                                             or multiple --extraChromeArgs)
                                                          [array] [default: []]
+      --robots                              If set, fetch and respect page disal
+                                            lows specified in per-host robots.tx
+                                            t         [boolean] [default: false]
+      --robotsAgent                         Agent to check in addition to '*' fo
+                                            r robots rules
+                                                [string] [default: "Browsertrix/1.x"]
       --config                              Path to YAML config file
 ```
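As a quick usage sketch (not part of the diff), the new flags can be appended to an existing crawl invocation; the volume mount, seed URL, and agent value below are illustrative, and --robotsAgent falls back to 'Browsertrix/1.x' when omitted:

```
docker run -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl \
  --url https://webrecorder.net/ --scopeType page \
  --robots --robotsAgent "Browsertrix/1.x"
```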
@@ -34,6 +34,7 @@
     "pixelmatch": "^5.3.0",
     "pngjs": "^7.0.0",
     "puppeteer-core": "^24.30.0",
+    "robots-parser": "^3.0.1",
     "sax": "^1.3.0",
     "sharp": "^0.32.6",
     "tsc": "^2.0.4",
@@ -72,6 +72,7 @@ import {
 import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js";
 import { initProxy } from "./util/proxy.js";
 import { initFlow, nextFlowStep } from "./util/flowbehavior.js";
+import { isDisallowedByRobots, setRobotsConfig } from "./util/robots.js";

 const btrixBehaviors = fs.readFileSync(
   new URL(
@@ -547,6 +548,10 @@ export class Crawler {

     this.headers = { "User-Agent": this.configureUA() };

+    if (this.params.robots) {
+      setRobotsConfig(this.headers, this.crawlState);
+    }
+
     process.on("exit", () => {
       for (const proc of subprocesses) {
         proc.kill();
@@ -2506,6 +2511,18 @@ self.__bx_behaviors.selectMainBehavior();
       return false;
     }

+    if (
+      this.params.robots &&
+      (await isDisallowedByRobots(url, logDetails, this.params.robotsAgent))
+    ) {
+      logger.debug(
+        "Page URL not queued, disallowed by robots.txt",
+        { url, ...logDetails },
+        "links",
+      );
+      return false;
+    }
+
     const result = await this.crawlState.addToQueue(
       { url, seedId, depth, extraHops, ts, pageid },
       this.pageLimit,
@@ -704,6 +704,19 @@ class ArgParser {
         type: "array",
         default: [],
       },
+
+      robots: {
+        describe:
+          "If set, fetch and respect page disallows specified in per-host robots.txt",
+        type: "boolean",
+        default: false,
+      },
+
+      robotsAgent: {
+        describe: "Agent to check in addition to '*' for robots rules",
+        type: "string",
+        default: "Browsertrix/1.x",
+      },
     });
   }

@@ -41,6 +41,8 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30;
 export const PAGE_OP_TIMEOUT_SECS = 5;
 export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;

+export const ROBOTS_CACHE_LIMIT = 100;
+
 export type ExtractSelector = {
   selector: string;
   extract: string;
@@ -57,6 +57,7 @@ export const LOG_CONTEXT_TYPES = [
   "replay",
   "proxy",
   "scope",
+  "robots",
 ] as const;

 export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
src/util/robots.ts (new file, 124 lines)
@@ -0,0 +1,124 @@
import { fetch } from "undici";
import robotsParser, { Robot } from "robots-parser";

import { LogDetails, logger } from "./logger.js";
import { RedisCrawlState } from "./state.js";
import { getProxyDispatcher } from "./proxy.js";
import { timedRun } from "./timing.js";

let headers: Record<string, string> = {};
let crawlState: RedisCrawlState | null = null;

const pendingFetches: Map<string, Promise<string>> = new Map<
  string,
  Promise<string>
>();

// max seconds to wait to fetch robots
const ROBOTS_FETCH_TIMEOUT = 10;

export function setRobotsConfig(
  _headers: Record<string, string>,
  state: RedisCrawlState,
) {
  headers = _headers;
  crawlState = state;
}

export async function isDisallowedByRobots(
  url: string,
  logDetails: LogDetails,
  robotsAgent: string,
) {
  const robots = await fetchAndParseRobots(url, logDetails);
  return robots && robots.isDisallowed(url, robotsAgent);
}

async function fetchAndParseRobots(
  url: string,
  logDetails: LogDetails,
): Promise<Robot | null> {
  // Fetch robots.txt for url's host and return parser.
  // Results are cached by robots.txt URL in Redis using an LRU cache
  // implementation that retains the 100 most recently used values.
  const urlParser = new URL(url);
  const robotsUrl = `${urlParser.origin}/robots.txt`;

  const cachedRobots = await crawlState!.getCachedRobots(robotsUrl);
  // empty string is valid cached empty robots, so check for null
  if (cachedRobots !== null) {
    // don't create parser, just skip check if empty string
    return cachedRobots ? robotsParser(robotsUrl, cachedRobots) : null;
  }

  try {
    let promise = pendingFetches.get(robotsUrl);

    if (!promise) {
      promise = timedRun(
        fetchRobots(robotsUrl, logDetails),
        ROBOTS_FETCH_TIMEOUT,
        "Fetching Robots timed out",
        logDetails,
        "robots",
      );
      pendingFetches.set(robotsUrl, promise);
    }

    const content = await promise;

    if (content === null) {
      return null;
    }

    logger.debug(
      "Caching robots.txt body",
      { url: robotsUrl, ...logDetails },
      "robots",
    );
    await crawlState!.setCachedRobots(robotsUrl, content);

    // empty string cached, but no need to create parser
    return content ? robotsParser(robotsUrl, content) : null;
  } catch (e) {
    // ignore
  } finally {
    pendingFetches.delete(robotsUrl);
  }
  logger.warn(
    "Failed to fetch robots.txt",
    {
      url: robotsUrl,
      ...logDetails,
    },
    "robots",
  );
  return null;
}

async function fetchRobots(
  url: string,
  logDetails: LogDetails,
): Promise<string | null> {
  logger.debug("Fetching robots.txt", { url, ...logDetails }, "robots");

  const resp = await fetch(url, {
    headers,
    dispatcher: getProxyDispatcher(url),
  });

  if (resp.ok) {
    const buff = await resp.arrayBuffer();
    // only decode and store at most 100K
    return new TextDecoder().decode(buff.slice(0, 100000));
  }

  logger.debug(
    "Robots.txt invalid, storing empty value",
    { url, status: resp.status },
    "robots",
  );

  // for other status errors, just return empty
  return "";
}
@@ -3,7 +3,11 @@ import { v4 as uuidv4 } from "uuid";

 import { logger } from "./logger.js";

-import { MAX_DEPTH, DEFAULT_MAX_RETRIES } from "./constants.js";
+import {
+  MAX_DEPTH,
+  DEFAULT_MAX_RETRIES,
+  ROBOTS_CACHE_LIMIT,
+} from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";
 import { interpolateFilename, UploadResult } from "./storage.js";
@@ -200,7 +204,10 @@ export class RedisCrawlState {
   fkey: string;
   ekey: string;
   bkey: string;
+  rkey: string;
+  lkey: string;
   pageskey: string;

   esKey: string;
   esMap: string;
@@ -233,6 +240,10 @@ export class RedisCrawlState {
     this.ekey = this.key + ":e";
     // crawler behavior script messages
     this.bkey = this.key + ":b";
+    // cached robots.txt bodies (per-origin)
+    this.rkey = this.key + ":r";
+    // LRU cache of robots.txt keys
+    this.lkey = this.key + ":l";
     // pages
     this.pageskey = this.key + ":pages";
@@ -1025,6 +1036,38 @@ return inx;
     return await this.redis.lpush(this.bkey, behaviorLog);
   }

+  async _updateRobotsAccessTime(robotsUrl: string) {
+    const accessTime = Date.now();
+    await this.redis.zadd(this.lkey, accessTime, robotsUrl);
+  }
+
+  async setCachedRobots(robotsUrl: string, body: string) {
+    await this._updateRobotsAccessTime(robotsUrl);
+    await this.redis.set(`${this.rkey}:${robotsUrl}`, body);
+
+    // prune least-recently used items in zset and robots cache if over limit
+    const cacheCount = await this.redis.zcard(this.lkey);
+    if (cacheCount > ROBOTS_CACHE_LIMIT) {
+      const diff = cacheCount - ROBOTS_CACHE_LIMIT;
+      const keysToDelete = await this.redis.zrange(this.lkey, 0, diff - 1);
+
+      for (const keyToDelete of keysToDelete) {
+        logger.debug(
+          "Deleting cached robots.txt, over cache limit",
+          { url: keyToDelete },
+          "robots",
+        );
+        await this.redis.del(`${this.rkey}:${keyToDelete}`);
+        await this.redis.zrem(this.lkey, keyToDelete);
+      }
+    }
+  }
+
+  async getCachedRobots(robotsUrl: string) {
+    await this._updateRobotsAccessTime(robotsUrl);
+    return await this.redis.get(`${this.rkey}:${robotsUrl}`);
+  }
+
   async writeToPagesQueue(
     data: Record<string, string | number | boolean | object>,
   ) {
tests/robots_txt.test.js (new file, 35 lines)
@@ -0,0 +1,35 @@
import child_process from "child_process";

test("test robots.txt is fetched and cached", async () => {
  const res = child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug",
  );

  const log = res.toString();

  // robots.txt not found
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Robots.txt invalid, storing empty value","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',
    ) > 0,
  ).toBe(true);

  // robots.txt found and cached
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);
});