Support custom css selectors for extracting links (#689)

Support an array of selectors via the --selectLinks option, in the
form [css selector]->[property] or [css selector]->@[attribute].
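
For example (the documented default plus the attribute-based form, as shown in the docs change below):

    --selectLinks "a[href]->href" --selectLinks "div.mylink->@custom-href"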
Ilya Kreymer 2024-11-08 08:04:41 -08:00 committed by GitHub
parent 2a9b152531
commit d04509639a
11 changed files with 194 additions and 109 deletions

View file

@@ -50,6 +50,11 @@ Options:
                                         e-page-application crawling or when
                                         different hashtags load dynamic cont
                                         ent
+      --selectLinks                     one or more selectors for extracting
+                                        links, in the format [css selector]
+                                        ->[property to use],[css selector]->
+                                        @[attribute to use]
+                                        [array] [default: ["a[href]->href"]]
       --blockRules                      Additional rules for blocking certai
                                         n URLs from being loaded, by URL reg
                                         ex and optionally via text match in
@@ -70,8 +75,7 @@ Options:
                                         [string] [default: "crawl-@ts"]
       --headless                        Run in headless mode, otherwise star
                                         t xvfb      [boolean] [default: false]
-      --driver                          JS driver for the crawler
-                                        [string] [default: "./defaultDriver.js"]
+      --driver                          JS driver for the crawler    [string]
       --generateCDX, --generatecdx, --gene  If set, generate index (CDXJ) for us
       rateCdx                           e with pywb after crawl is done
                                         [boolean] [default: false]
@@ -248,8 +252,8 @@ Options:
                                         [boolean] [default: false]
       --customBehaviors                 Custom behavior files to inject. Val
                                         ues can be URLs, paths to individual
-                                        behavior files, or paths to a direct
-                                        ory of behavior files.
+                                        behavior files, or paths to a direc
+                                        tory of behavior files
                                         [array] [default: []]
       --debugAccessRedis                if set, runs internal redis without
                                         protected mode to allow external acc
@@ -289,14 +293,14 @@ Options:
   --version          Show version number                               [boolean]
   --url              The URL of the login page                [string] [required]
   --user             The username for the login. If not specified, will b
-                     e prompted
+                     e prompted                                         [string]
   --password         The password for the login. If not specified, will b
-                     e prompted (recommended)
+                     e prompted (recommended)                           [string]
   --filename         The filename for the profile tarball, stored within
                      /crawls/profiles if absolute path not provided
-                     [default: "/crawls/profiles/profile.tar.gz"]
+                     [string] [default: "/crawls/profiles/profile.tar.gz"]
   --debugScreenshot  If specified, take a screenshot after login and save
-                     as this filename
+                     as this filename                [boolean] [default: false]
   --headless         Run in headless mode, otherwise start xvfb
                      [boolean] [default: false]
   --automated        Start in automated mode, no interactive browser

View file

@@ -17,6 +17,16 @@ can be used to specify additional seconds to wait after the page appears to have
 (On the other hand, the `--pageExtraDelay`/`--delay` option adds an extra delay after all post-load actions have taken place, and can be useful for rate-limiting.)

+## Link Extraction
+
+By default, the crawler will extract all `href` properties from all `<a>` tags that have an `href`.
+
+This can be customized with the `--selectLinks` option, which can provide alternative selectors of the form: `[css selector]->[property to use]` or `[css selector]->@[attribute to use]`. The default value is `a[href]->href`.
+
+For example, to keep the default but also treat all `div`s with class `mylink` as links, using their `custom-href` attribute as the link, use `--selectLinks 'a[href]->href' --selectLinks 'div.mylink->@custom-href'`.
+
+Any number of selectors can be specified in this way, and each will be applied in sequence on each page.
+
 ## Ad Blocking

 Brave Browser, the browser used by Browsertrix Crawler for crawling, has some ad and tracker blocking features enabled by default. These [Shields](https://brave.com/shields/) can be disabled or customized using [Browser Profiles](browser-profiles.md).
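
The property-vs-attribute distinction documented above matters in practice: a DOM property such as `href` comes back from the browser as a resolved absolute URL, while an attribute comes back exactly as written in the HTML. A minimal standalone sketch of that difference (illustrative only, not the crawler's actual in-page code; `extractWithSelector` is a hypothetical name):

    // Apply one parsed selector spec to a document and collect link values.
    function extractWithSelector(
      doc: Document,
      sel: { selector: string; extract: string; isAttribute: boolean },
    ): string[] {
      const results: string[] = [];
      for (const elem of Array.from(doc.querySelectorAll(sel.selector))) {
        // attribute: raw string via getAttribute(); property: browser-resolved value
        const value = sel.isAttribute
          ? elem.getAttribute(sel.extract)
          : (elem as HTMLElement & Record<string, unknown>)[sel.extract];
        if (value) {
          results.push(String(value));
        }
      }
      return results;
    }

    // extractWithSelector(document, { selector: "a[href]", extract: "href", isAttribute: false })
    //   => resolved absolute URLs, e.g. "https://example.com/page"
    // extractWithSelector(document, { selector: "div.mylink", extract: "custom-href", isAttribute: true })
    //   => the attribute text exactly as authored in the HTML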

View file

@@ -20,6 +20,7 @@
     "@webrecorder/wabac": "^2.20.0",
     "browsertrix-behaviors": "^0.6.4",
     "client-zip": "^2.4.5",
+    "css-selector-parser": "^3.0.5",
     "fetch-socks": "^1.3.0",
     "get-folder-size": "^4.0.0",
     "husky": "^8.0.3",

View file

@@ -46,8 +46,8 @@ import { Browser } from "./util/browser.js";
 import {
   ADD_LINK_FUNC,
   BEHAVIOR_LOG_FUNC,
-  DEFAULT_SELECTORS,
   DISPLAY,
+  ExtractSelector,
   PAGE_OP_TIMEOUT_SECS,
   SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
 } from "./util/constants.js";
@@ -191,12 +191,14 @@ export class Crawler {
   proxyServer?: string;

-  driver!: (opts: {
-    page: Page;
-    data: PageState;
-    // eslint-disable-next-line no-use-before-define
-    crawler: Crawler;
-  }) => Promise<void>;
+  driver:
+    | ((opts: {
+        page: Page;
+        data: PageState;
+        // eslint-disable-next-line no-use-before-define
+        crawler: Crawler;
+      }) => Promise<void>)
+    | null = null;

   recording: boolean;
@@ -491,6 +493,8 @@ export class Crawler {
     logger.info("Seeds", this.seeds);

+    logger.info("Link Selectors", this.params.selectLinks);
+
     if (this.params.behaviorOpts) {
       logger.info("Behavior Options", this.params.behaviorOpts);
     } else {
@@ -930,8 +934,12 @@ self.__bx_behaviors.selectMainBehavior();
       await page.setExtraHTTPHeaders({});
     }

-    // run custom driver here
-    await this.driver({ page, data, crawler: this });
+    // run custom driver here, if any
+    if (this.driver) {
+      await this.driver({ page, data, crawler: this });
+    } else {
+      await this.loadPage(page, data);
+    }

     data.title = await timedRun(
       page.title(),
@@ -1347,12 +1355,14 @@ self.__bx_behaviors.selectMainBehavior();
       );
     }

-    try {
-      const driverUrl = new URL(this.params.driver, import.meta.url);
-      this.driver = (await import(driverUrl.href)).default;
-    } catch (e) {
-      logger.warn(`Error importing driver ${this.params.driver}`, e);
-      return;
+    if (this.params.driver) {
+      try {
+        const driverUrl = new URL(this.params.driver, import.meta.url);
+        this.driver = (await import(driverUrl.href)).default;
+      } catch (e) {
+        logger.warn(`Error importing driver ${this.params.driver}`, e);
+        return;
+      }
     }

     await this.initCrawlState();
@@ -1741,11 +1751,7 @@ self.__bx_behaviors.selectMainBehavior();
       }
     }

-  async loadPage(
-    page: Page,
-    data: PageState,
-    selectorOptsList = DEFAULT_SELECTORS,
-  ) {
+  async loadPage(page: Page, data: PageState) {
     const { url, depth } = data;
     const logDetails = data.logDetails;
@@ -1946,14 +1952,18 @@ self.__bx_behaviors.selectMainBehavior();
     await this.awaitPageLoad(page.mainFrame(), logDetails);

     // skip extraction if at max depth
-    if (seed.isAtMaxDepth(depth, extraHops) || !selectorOptsList) {
-      logger.debug("Skipping Link Extraction, At Max Depth");
+    if (seed.isAtMaxDepth(depth, extraHops)) {
+      logger.debug("Skipping Link Extraction, At Max Depth", {}, "links");
       return;
     }

-    logger.debug("Extracting links", logDetails);
+    logger.debug(
+      "Extracting links",
+      { selectors: this.params.selectLinks, ...logDetails },
+      "links",
+    );

-    await this.extractLinks(page, data, selectorOptsList, logDetails);
+    await this.extractLinks(page, data, this.params.selectLinks, logDetails);
   }

   async netIdle(page: Page, details: LogDetails) {
@@ -1999,7 +2009,7 @@ self.__bx_behaviors.selectMainBehavior();
   async extractLinks(
     page: Page,
     data: PageState,
-    selectors = DEFAULT_SELECTORS,
+    selectors: ExtractSelector[],
     logDetails: LogDetails,
   ) {
     const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;
@@ -2045,11 +2055,7 @@ self.__bx_behaviors.selectMainBehavior();
     const frames = filteredFrames || page.frames();

     try {
-      for (const {
-        selector = "a[href]",
-        extract = "href",
-        isAttribute = false,
-      } of selectors) {
+      for (const { selector, extract, isAttribute } of selectors) {
         await Promise.allSettled(
           frames.map((frame) => {
             const getLinks = frame

View file

@@ -1,15 +0,0 @@
-import { Page } from "puppeteer-core";
-import { PageState } from "./util/state.js";
-import { Crawler } from "./crawler.js";
-
-export default async ({
-  data,
-  page,
-  crawler,
-}: {
-  data: PageState;
-  page: Page;
-  crawler: Crawler;
-}) => {
-  await crawler.loadPage(page, data);
-};

View file

@@ -7,11 +7,15 @@ import { KnownDevices as devices } from "puppeteer-core";
 import yargs from "yargs";
 import { hideBin } from "yargs/helpers";

+import { createParser } from "css-selector-parser";
+
 import {
   BEHAVIOR_LOG_FUNC,
   WAIT_UNTIL_OPTS,
   EXTRACT_TEXT_TYPES,
   SERVICE_WORKER_OPTS,
+  DEFAULT_SELECTORS,
+  ExtractSelector,
 } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { interpolateFilename } from "./storage.js";
@@ -32,6 +36,8 @@ export type CrawlerArgs = ReturnType<typeof parseArgs> & {
   scopedSeeds: ScopedSeed[];

+  selectLinks: ExtractSelector[];
+
   crawlId: string;

   // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -156,6 +162,14 @@ class ArgParser {
           "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
       },

+      selectLinks: {
+        describe:
+          "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]",
+        type: "array",
+        default: ["a[href]->href"],
+        coerce,
+      },
+
       blockRules: {
         describe:
           "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
@@ -200,9 +214,8 @@ class ArgParser {
       },

       driver: {
-        describe: "JS driver for the crawler",
+        describe: "Custom driver for the crawler, if any",
         type: "string",
-        default: "./defaultDriver.js",
       },

       generateCDX: {
@@ -714,6 +727,30 @@ class ArgParser {
       }
     }

+    let selectLinks: ExtractSelector[];
+
+    const parser = createParser();
+
+    if (argv.selectLinks) {
+      selectLinks = argv.selectLinks.map((x: string) => {
+        const parts = x.split("->");
+        const selector = parts[0];
+        const value = parts[1] || "";
+        const extract = parts.length > 1 ? value.replace("@", "") : "href";
+        const isAttribute = value.startsWith("@");
+        try {
+          parser(selector);
+        } catch (e) {
+          logger.fatal("Invalid Link Extraction CSS Selector", { selector });
+        }
+        return { selector, extract, isAttribute };
+      });
+    } else {
+      selectLinks = DEFAULT_SELECTORS;
+    }
+
+    argv.selectLinks = selectLinks;
+
     if (argv.netIdleWait === -1) {
       if (argv.scopeType === "page" || argv.scopeType === "page-spa") {
         argv.netIdleWait = 15;
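
The `->` parsing rule added in this hunk is easiest to see on concrete inputs. A standalone restatement of the same logic (a sketch; `parseSelectLink` is an illustrative name, not part of the crawler's API):

    type ExtractSelector = {
      selector: string;
      extract: string;
      isAttribute: boolean;
    };

    // Mirrors the argv.selectLinks.map() above:
    // "a[href]"          -> no "->", defaults to the "href" property
    // "a[href]->href"    -> extract the "href" DOM property
    // "div.x->@data-url" -> "@" marks an attribute; "@" is stripped from the name
    function parseSelectLink(x: string): ExtractSelector {
      const parts = x.split("->");
      const selector = parts[0];
      const value = parts[1] || "";
      const extract = parts.length > 1 ? value.replace("@", "") : "href";
      const isAttribute = value.startsWith("@");
      return { selector, extract, isAttribute };
    }

    // parseSelectLink("script[src]->src")
    //   => { selector: "script[src]", extract: "src", isAttribute: false }
    // parseSelectLink("div.mylink->@custom-href")
    //   => { selector: "div.mylink", extract: "custom-href", isAttribute: true }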

View file

@@ -30,7 +30,13 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30;
 export const PAGE_OP_TIMEOUT_SECS = 5;
 export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;

-export const DEFAULT_SELECTORS = [
+export type ExtractSelector = {
+  selector: string;
+  extract: string;
+  isAttribute: boolean;
+};
+
+export const DEFAULT_SELECTORS: ExtractSelector[] = [
   {
     selector: "a[href]",
     extract: "href",

View file

@@ -1,52 +1,15 @@
 import child_process from "child_process";
 import fs from "fs";

-test("ensure custom driver with custom selector crawls JS files as pages", async () => {
+test("ensure custom driver creates PDF", async () => {
   try {
     child_process.execSync(
-      "docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs",
+      "docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs --limit 1",
     );
   } catch (error) {
     console.log(error);
   }

-  const crawledPages = fs.readFileSync(
-    "test-crawls/collections/custom-driver-1/pages/pages.jsonl",
-    "utf8",
-  );
-  const pages = new Set();
-
-  for (const line of crawledPages.trim().split("\n")) {
-    const url = JSON.parse(line).url;
-    if (!url) {
-      continue;
-    }
-    pages.add(url);
-  }
-
-  const crawledExtraPages = fs.readFileSync(
-    "test-crawls/collections/custom-driver-1/pages/extraPages.jsonl",
-    "utf8",
-  );
-  const extraPages = new Set();
-
-  for (const line of crawledExtraPages.trim().split("\n")) {
-    const url = JSON.parse(line).url;
-    if (!url) {
-      continue;
-    }
-    extraPages.add(url);
-  }
-
-  const expectedPages = new Set([
-    "https://www.iana.org/",
-  ]);
-
-  const expectedExtraPages = new Set([
-    "https://www.iana.org/_js/jquery.js",
-    "https://www.iana.org/_js/iana.js",
-  ]);
-
-  expect(pages).toEqual(expectedPages);
-  expect(extraPages).toEqual(expectedExtraPages);
+  const pdfs = fs
+    .readdirSync("test-crawls/collections/custom-driver-1")
+    .filter((x) => x.endsWith(".pdf"));
+  expect(pdfs.length).toBe(1);
 });

View file

@@ -0,0 +1,68 @@
+import child_process from "child_process";
+import fs from "fs";
+
+test("test custom selector crawls JS files as pages", async () => {
+  try {
+    child_process.execSync(
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-1 --selectLinks \"script[src]->src\"",
+    );
+  } catch (error) {
+    console.log(error);
+  }
+
+  const crawledPages = fs.readFileSync(
+    "test-crawls/collections/custom-sel-1/pages/pages.jsonl",
+    "utf8",
+  );
+  const pages = new Set();
+
+  for (const line of crawledPages.trim().split("\n")) {
+    const url = JSON.parse(line).url;
+    if (!url) {
+      continue;
+    }
+    pages.add(url);
+  }
+
+  const crawledExtraPages = fs.readFileSync(
+    "test-crawls/collections/custom-sel-1/pages/extraPages.jsonl",
+    "utf8",
+  );
+  const extraPages = new Set();
+
+  for (const line of crawledExtraPages.trim().split("\n")) {
+    const url = JSON.parse(line).url;
+    if (!url) {
+      continue;
+    }
+    extraPages.add(url);
+  }
+
+  const expectedPages = new Set([
+    "https://www.iana.org/",
+  ]);
+
+  const expectedExtraPages = new Set([
+    "https://www.iana.org/_js/jquery.js",
+    "https://www.iana.org/_js/iana.js",
+  ]);
+
+  expect(pages).toEqual(expectedPages);
+  expect(extraPages).toEqual(expectedExtraPages);
+});
+
+test("test invalid selector, crawl fails", async () => {
+  let failed = false;
+  try {
+    child_process.execSync(
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-invalid --selectLinks \"script[\"",
+    );
+  } catch (error) {
+    failed = true;
+  }
+  expect(failed).toBe(true);
+});

View file

@@ -1,5 +1,5 @@
 export default async ({ data, page, crawler }) => {
-  await crawler.loadPage(page, data, [
-    { selector: "script[src]", extract: "src", isAttribute: false },
-  ]);
+  await crawler.loadPage(page, data);
+
+  await page.pdf({"path": `${crawler.collDir}/${data.pageid}.pdf`});
 };
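
With the default driver removed, passing --driver is now fully opt-in: a crawl without it runs loadPage() directly, while a supplied driver (as in the fixture above) takes over per-page handling and should call crawler.loadPage() itself to keep the default load and link-extraction behavior before adding custom steps. A variant sketch using Puppeteer's page.screenshot() instead of page.pdf() (hypothetical file, not part of this commit):

    export default async ({ data, page, crawler }) => {
      // keep default page loading + link extraction
      await crawler.loadPage(page, data);
      // custom per-page step: save a screenshot into the collection dir
      await page.screenshot({ path: `${crawler.collDir}/${data.pageid}.png` });
    };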

View file

@@ -1312,16 +1312,16 @@
   resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
   integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==

-"@webrecorder/wabac@^2.20.0-beta.4":
-  version "2.20.0-beta.4"
-  resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.20.0-beta.4.tgz#c60fcd00f449cca52ce1a0bef305a06922c9e3e8"
-  integrity sha512-enHYcZoqs7cOu2tdTqVeB/zB27uL4wmCMzvF55bJqdB8d5zgPpY+/fpRA3eLxGrPc0nFYAjsI/aNaa62FH7WKQ==
+"@webrecorder/wabac@^2.20.0":
+  version "2.20.1"
+  resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.20.1.tgz#58e397e2ef1c33de1bb37aa4f51fc7f3eec8a1f7"
+  integrity sha512-RX+U6m7aVgvsAfLb9FuLY/PcHCNL5dc1FPaD0GnUiFgswSSe5v4MjIhqJNOnbrJYEcbib81AJfxNuvOyXAJDJQ==
   dependencies:
     "@peculiar/asn1-ecc" "^2.3.4"
     "@peculiar/asn1-schema" "^2.3.3"
     "@peculiar/x509" "^1.9.2"
     "@types/js-levenshtein" "^1.1.3"
-    "@webrecorder/wombat" "^3.8.2"
+    "@webrecorder/wombat" "^3.8.3"
     acorn "^8.10.0"
     auto-js-ipfs "^2.1.1"
     base64-js "^1.5.1"
@@ -1342,10 +1342,10 @@
     stream-browserify "^3.0.0"
     warcio "^2.3.1"

-"@webrecorder/wombat@^3.8.2":
-  version "3.8.2"
-  resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.2.tgz#e46e18719834d633175eec52ce753a4dc4e48e27"
-  integrity sha512-uUZr9V4UYpVOpM64Tm27ND/hMjDbT37+/qyNaNV6loqDuVzBVQh5w7SfTEy0Bbjj1MYyNZP244mOtWtotTpUEA==
+"@webrecorder/wombat@^3.8.3":
+  version "3.8.3"
+  resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.3.tgz#c5a077225d1a70def9fbbbfcd50fa4465d236546"
+  integrity sha512-dqgoxigB3OdX5JeB3yxJrUNwFwUBlYC+LmGrLEgGeP259MFzXQLD2pmfuqGt5ygWvIv56SrAMV4sUceux07X2A==
   dependencies:
     warcio "^2.3.1"
@@ -1963,6 +1963,11 @@ crypto-random-string@^4.0.0:
   dependencies:
     type-fest "^1.0.1"

+css-selector-parser@^3.0.5:
+  version "3.0.5"
+  resolved "https://registry.yarnpkg.com/css-selector-parser/-/css-selector-parser-3.0.5.tgz#9b636ebccf7c4bcce5c1ac21ae27de9f01180ae9"
+  integrity sha512-3itoDFbKUNx1eKmVpYMFyqKX04Ww9osZ+dLgrk6GEv6KMVeXUhUnp4I5X+evw+u3ZxVU6RFXSSRxlTeMh8bA+g==
+
 data-uri-to-buffer@^5.0.1:
   version "5.0.1"
   resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-5.0.1.tgz#db89a9e279c2ffe74f50637a59a32fb23b3e4d7c"