diff --git a/docs/docs/user-guide/cli-options.md b/docs/docs/user-guide/cli-options.md index c92d8008..fb449c97 100644 --- a/docs/docs/user-guide/cli-options.md +++ b/docs/docs/user-guide/cli-options.md @@ -50,6 +50,11 @@ Options: e-page-application crawling or when different hashtags load dynamic cont ent + --selectLinks one or more selectors for extracting + links, in the format [css selector] + ->[property to use],[css selector]-> + @[attribute to use] + [array] [default: ["a[href]->href"]] --blockRules Additional rules for blocking certai n URLs from being loaded, by URL reg ex and optionally via text match in @@ -70,8 +75,7 @@ Options: [string] [default: "crawl-@ts"] --headless Run in headless mode, otherwise star t xvfb [boolean] [default: false] - --driver JS driver for the crawler - [string] [default: "./defaultDriver.js"] + --driver JS driver for the crawler [string] --generateCDX, --generatecdx, --gene If set, generate index (CDXJ) for us rateCdx e with pywb after crawl is done [boolean] [default: false] @@ -248,8 +252,8 @@ Options: [boolean] [default: false] --customBehaviors Custom behavior files to inject. Val ues can be URLs, paths to individual - behavior files, or paths to a direct - ory of behavior files. + behavior files, or paths to a direc + tory of behavior files [array] [default: []] --debugAccessRedis if set, runs internal redis without protected mode to allow external acc @@ -289,14 +293,14 @@ Options: --version Show version number [boolean] --url The URL of the login page [string] [required] --user The username for the login. If not specified, will b - e prompted + e prompted [string] --password The password for the login. If not specified, will b - e prompted (recommended) + e prompted (recommended) [string] --filename The filename for the profile tarball, stored within /crawls/profiles if absolute path not provided - [default: "/crawls/profiles/profile.tar.gz"] + [string] [default: "/crawls/profiles/profile.tar.gz"] --debugScreenshot If specified, take a screenshot after login and save - as this filename + as this filename [boolean] [default: false] --headless Run in headless mode, otherwise start xvfb [boolean] [default: false] --automated Start in automated mode, no interactive browser diff --git a/docs/docs/user-guide/common-options.md b/docs/docs/user-guide/common-options.md index f8772751..91860a05 100644 --- a/docs/docs/user-guide/common-options.md +++ b/docs/docs/user-guide/common-options.md @@ -17,6 +17,16 @@ can be used to specify additional seconds to wait after the page appears to have (On the other hand, the `--pageExtraDelay`/`--delay` adds an extra delay after all post-load actions have taken place, and can be useful for rate-limiting.) +## Link Extraction + +By default, the crawler will extract all `href` properties from all `<a>` tags that have an `href`. +This can be customized with the `--selectLinks` option, which can provide alternative selectors of the form: +`[css selector]->[property to use]` or `[css selector]->@[attribute to use]`. The default value is `a[href]->href`. + +For example, to specify the default, but also include all `divs` that have the class `mylink` and use the `custom-href` attribute as the link, use `--selectLinks 'a[href]->href' --selectLinks 'div.mylink->@custom-href'`. + +Any number of selectors can be specified in this way, and each will be applied in sequence on each page. + ## Ad Blocking Brave Browser, the browser used by Browsertrix Crawler for crawling, has some ad and tracker blocking features enabled by default.
These [Shields](https://brave.com/shields/) can be disabled or customized using [Browser Profiles](browser-profiles.md). diff --git a/package.json b/package.json index a42ae2a7..d2175935 100644 --- a/package.json +++ b/package.json @@ -20,6 +20,7 @@ "@webrecorder/wabac": "^2.20.0", "browsertrix-behaviors": "^0.6.4", "client-zip": "^2.4.5", + "css-selector-parser": "^3.0.5", "fetch-socks": "^1.3.0", "get-folder-size": "^4.0.0", "husky": "^8.0.3", diff --git a/src/crawler.ts b/src/crawler.ts index 66107868..f18f6a7a 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -46,8 +46,8 @@ import { Browser } from "./util/browser.js"; import { ADD_LINK_FUNC, BEHAVIOR_LOG_FUNC, - DEFAULT_SELECTORS, DISPLAY, + ExtractSelector, PAGE_OP_TIMEOUT_SECS, SITEMAP_INITIAL_FETCH_TIMEOUT_SECS, } from "./util/constants.js"; @@ -191,12 +191,14 @@ export class Crawler { proxyServer?: string; - driver!: (opts: { - page: Page; - data: PageState; - // eslint-disable-next-line no-use-before-define - crawler: Crawler; - }) => Promise<void>; + driver: + | ((opts: { + page: Page; + data: PageState; + // eslint-disable-next-line no-use-before-define + crawler: Crawler; + }) => Promise<void>) + | null = null; recording: boolean; @@ -491,6 +493,8 @@ export class Crawler { logger.info("Seeds", this.seeds); + logger.info("Link Selectors", this.params.selectLinks); + if (this.params.behaviorOpts) { logger.info("Behavior Options", this.params.behaviorOpts); } else { @@ -930,8 +934,12 @@ self.__bx_behaviors.selectMainBehavior(); await page.setExtraHTTPHeaders({}); } - // run custom driver here - await this.driver({ page, data, crawler: this }); + // run custom driver here, if any + if (this.driver) { + await this.driver({ page, data, crawler: this }); + } else { + await this.loadPage(page, data); + } data.title = await timedRun( page.title(), @@ -1347,12 +1355,14 @@ self.__bx_behaviors.selectMainBehavior(); ); } - try { - const driverUrl = new URL(this.params.driver, import.meta.url); - this.driver = (await import(driverUrl.href)).default; - } catch (e) { - logger.warn(`Error importing driver ${this.params.driver}`, e); - return; + if (this.params.driver) { + try { + const driverUrl = new URL(this.params.driver, import.meta.url); + this.driver = (await import(driverUrl.href)).default; + } catch (e) { + logger.warn(`Error importing driver ${this.params.driver}`, e); + return; + } } await this.initCrawlState(); @@ -1741,11 +1751,7 @@ self.__bx_behaviors.selectMainBehavior(); } } - async loadPage( - page: Page, - data: PageState, - selectorOptsList = DEFAULT_SELECTORS, - ) { + async loadPage(page: Page, data: PageState) { const { url, depth } = data; const logDetails = data.logDetails; @@ -1946,14 +1952,18 @@ self.__bx_behaviors.selectMainBehavior(); await this.awaitPageLoad(page.mainFrame(), logDetails); // skip extraction if at max depth - if (seed.isAtMaxDepth(depth, extraHops) || !selectorOptsList) { - logger.debug("Skipping Link Extraction, At Max Depth"); + if (seed.isAtMaxDepth(depth, extraHops)) { + logger.debug("Skipping Link Extraction, At Max Depth", {}, "links"); return; } - logger.debug("Extracting links", logDetails); + logger.debug( + "Extracting links", + { selectors: this.params.selectLinks, ...logDetails }, + "links", + ); - await this.extractLinks(page, data, selectorOptsList, logDetails); + await this.extractLinks(page, data, this.params.selectLinks, logDetails); } async netIdle(page: Page, details: LogDetails) { @@ -1999,7 +2009,7 @@ self.__bx_behaviors.selectMainBehavior(); async extractLinks( page: Page, data: PageState, -
selectors = DEFAULT_SELECTORS, + selectors: ExtractSelector[], logDetails: LogDetails, ) { const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data; @@ -2045,11 +2055,7 @@ self.__bx_behaviors.selectMainBehavior(); const frames = filteredFrames || page.frames(); try { - for (const { - selector = "a[href]", - extract = "href", - isAttribute = false, - } of selectors) { + for (const { selector, extract, isAttribute } of selectors) { await Promise.allSettled( frames.map((frame) => { const getLinks = frame diff --git a/src/defaultDriver.ts b/src/defaultDriver.ts deleted file mode 100644 index caf1f338..00000000 --- a/src/defaultDriver.ts +++ /dev/null @@ -1,15 +0,0 @@ -import { Page } from "puppeteer-core"; -import { PageState } from "./util/state.js"; -import { Crawler } from "./crawler.js"; - -export default async ({ - data, - page, - crawler, -}: { - data: PageState; - page: Page; - crawler: Crawler; -}) => { - await crawler.loadPage(page, data); -}; diff --git a/src/util/argParser.ts b/src/util/argParser.ts index bf5b61b1..2572e1d5 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -7,11 +7,15 @@ import { KnownDevices as devices } from "puppeteer-core"; import yargs from "yargs"; import { hideBin } from "yargs/helpers"; +import { createParser } from "css-selector-parser"; + import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES, SERVICE_WORKER_OPTS, + DEFAULT_SELECTORS, + ExtractSelector, } from "./constants.js"; import { ScopedSeed } from "./seeds.js"; import { interpolateFilename } from "./storage.js"; @@ -32,6 +36,8 @@ export type CrawlerArgs = ReturnType & { scopedSeeds: ScopedSeed[]; + selectLinks: ExtractSelector[]; + crawlId: string; // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -156,6 +162,14 @@ class ArgParser { "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content", }, + selectLinks: { + describe: + "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]", + type: "array", + default: ["a[href]->href"], + coerce, + }, + blockRules: { describe: "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe", @@ -200,9 +214,8 @@ class ArgParser { }, driver: { - describe: "JS driver for the crawler", + describe: "Custom driver for the crawler, if any", type: "string", - default: "./defaultDriver.js", }, generateCDX: { @@ -714,6 +727,30 @@ class ArgParser { } } + let selectLinks: ExtractSelector[]; + + const parser = createParser(); + + if (argv.selectLinks) { + selectLinks = argv.selectLinks.map((x: string) => { + const parts = x.split("->"); + const selector = parts[0]; + const value = parts[1] || ""; + const extract = parts.length > 1 ? 
value.replace("@", "") : "href"; + const isAttribute = value.startsWith("@"); + try { + parser(selector); + } catch (e) { + logger.fatal("Invalid Link Extraction CSS Selector", { selector }); + } + return { selector, extract, isAttribute }; + }); + } else { + selectLinks = DEFAULT_SELECTORS; + } + + argv.selectLinks = selectLinks; + if (argv.netIdleWait === -1) { if (argv.scopeType === "page" || argv.scopeType === "page-spa") { argv.netIdleWait = 15; diff --git a/src/util/constants.ts b/src/util/constants.ts index 681debf0..6d914def 100644 --- a/src/util/constants.ts +++ b/src/util/constants.ts @@ -30,7 +30,13 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30; export const PAGE_OP_TIMEOUT_SECS = 5; export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30; -export const DEFAULT_SELECTORS = [ +export type ExtractSelector = { + selector: string; + extract: string; + isAttribute: boolean; +}; + +export const DEFAULT_SELECTORS: ExtractSelector[] = [ { selector: "a[href]", extract: "href", diff --git a/tests/custom_driver.test.js b/tests/custom_driver.test.js index 9cb0942e..7fed31d9 100644 --- a/tests/custom_driver.test.js +++ b/tests/custom_driver.test.js @@ -1,52 +1,15 @@ import child_process from "child_process"; import fs from "fs"; -test("ensure custom driver with custom selector crawls JS files as pages", async () => { +test("ensure custom driver creates PDF", async () => { try { child_process.execSync( - "docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs", + "docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs --limit 1", ); } catch (error) { console.log(error); } - const crawledPages = fs.readFileSync( - "test-crawls/collections/custom-driver-1/pages/pages.jsonl", - "utf8", - ); - const pages = new Set(); - - for (const line of crawledPages.trim().split("\n")) { - const url = JSON.parse(line).url; - if (!url) { - continue; - } - pages.add(url); - } - - const crawledExtraPages = fs.readFileSync( - "test-crawls/collections/custom-driver-1/pages/extraPages.jsonl", - "utf8", - ); - const extraPages = new Set(); - - for (const line of crawledExtraPages.trim().split("\n")) { - const url = JSON.parse(line).url; - if (!url) { - continue; - } - extraPages.add(url); - } - - const expectedPages = new Set([ - "https://www.iana.org/", - ]); - - const expectedExtraPages = new Set([ - "https://www.iana.org/_js/jquery.js", - "https://www.iana.org/_js/iana.js", - ]); - - expect(pages).toEqual(expectedPages); - expect(extraPages).toEqual(expectedExtraPages); + const pdfs = fs.readdirSync("test-crawls/collections/custom-driver-1").filter(x => x.endsWith(".pdf")); + expect(pdfs.length).toBe(1); }); diff --git a/tests/custom_selector.test.js b/tests/custom_selector.test.js new file mode 100644 index 00000000..f1e2b7af --- /dev/null +++ b/tests/custom_selector.test.js @@ -0,0 +1,68 @@ +import child_process from "child_process"; +import fs from "fs"; + +test("test custom selector crawls JS files as pages", async () => { + try { + child_process.execSync( + "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-1 --selectLinks \"script[src]->src\"", + ); + } catch (error) { + console.log(error); + } + + const crawledPages 
= fs.readFileSync( + "test-crawls/collections/custom-sel-1/pages/pages.jsonl", + "utf8", + ); + const pages = new Set(); + + for (const line of crawledPages.trim().split("\n")) { + const url = JSON.parse(line).url; + if (!url) { + continue; + } + pages.add(url); + } + + const crawledExtraPages = fs.readFileSync( + "test-crawls/collections/custom-sel-1/pages/extraPages.jsonl", + "utf8", + ); + const extraPages = new Set(); + + for (const line of crawledExtraPages.trim().split("\n")) { + const url = JSON.parse(line).url; + if (!url) { + continue; + } + extraPages.add(url); + } + + const expectedPages = new Set([ + "https://www.iana.org/", + ]); + + const expectedExtraPages = new Set([ + "https://www.iana.org/_js/jquery.js", + "https://www.iana.org/_js/iana.js", + ]); + + expect(pages).toEqual(expectedPages); + expect(extraPages).toEqual(expectedExtraPages); +}); + + +test("test invalid selector, crawl fails", async () => { + let failed = false; + try { + child_process.execSync( + "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-invalid --selectLinks \"script[\"", + ); + } catch (error) { + failed = true; + } + + expect(failed).toBe(true); +}); + + diff --git a/tests/fixtures/driver-1.mjs b/tests/fixtures/driver-1.mjs index daa82607..ade14b78 100644 --- a/tests/fixtures/driver-1.mjs +++ b/tests/fixtures/driver-1.mjs @@ -1,5 +1,5 @@ export default async ({ data, page, crawler }) => { - await crawler.loadPage(page, data, [ - { selector: "script[src]", extract: "src", isAttribute: false }, - ]); + await crawler.loadPage(page, data); + + await page.pdf({"path": `${crawler.collDir}/${data.pageid}.pdf`}); }; diff --git a/yarn.lock b/yarn.lock index 8b0b963e..662e8bff 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1312,16 +1312,16 @@ resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406" integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ== -"@webrecorder/wabac@^2.20.0-beta.4": - version "2.20.0-beta.4" - resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.20.0-beta.4.tgz#c60fcd00f449cca52ce1a0bef305a06922c9e3e8" - integrity sha512-enHYcZoqs7cOu2tdTqVeB/zB27uL4wmCMzvF55bJqdB8d5zgPpY+/fpRA3eLxGrPc0nFYAjsI/aNaa62FH7WKQ== +"@webrecorder/wabac@^2.20.0": + version "2.20.1" + resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.20.1.tgz#58e397e2ef1c33de1bb37aa4f51fc7f3eec8a1f7" + integrity sha512-RX+U6m7aVgvsAfLb9FuLY/PcHCNL5dc1FPaD0GnUiFgswSSe5v4MjIhqJNOnbrJYEcbib81AJfxNuvOyXAJDJQ== dependencies: "@peculiar/asn1-ecc" "^2.3.4" "@peculiar/asn1-schema" "^2.3.3" "@peculiar/x509" "^1.9.2" "@types/js-levenshtein" "^1.1.3" - "@webrecorder/wombat" "^3.8.2" + "@webrecorder/wombat" "^3.8.3" acorn "^8.10.0" auto-js-ipfs "^2.1.1" base64-js "^1.5.1" @@ -1342,10 +1342,10 @@ stream-browserify "^3.0.0" warcio "^2.3.1" -"@webrecorder/wombat@^3.8.2": - version "3.8.2" - resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.2.tgz#e46e18719834d633175eec52ce753a4dc4e48e27" - integrity sha512-uUZr9V4UYpVOpM64Tm27ND/hMjDbT37+/qyNaNV6loqDuVzBVQh5w7SfTEy0Bbjj1MYyNZP244mOtWtotTpUEA== +"@webrecorder/wombat@^3.8.3": + version "3.8.3" + resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.3.tgz#c5a077225d1a70def9fbbbfcd50fa4465d236546" + integrity sha512-dqgoxigB3OdX5JeB3yxJrUNwFwUBlYC+LmGrLEgGeP259MFzXQLD2pmfuqGt5ygWvIv56SrAMV4sUceux07X2A== dependencies: warcio 
"^2.3.1" @@ -1963,6 +1963,11 @@ crypto-random-string@^4.0.0: dependencies: type-fest "^1.0.1" +css-selector-parser@^3.0.5: + version "3.0.5" + resolved "https://registry.yarnpkg.com/css-selector-parser/-/css-selector-parser-3.0.5.tgz#9b636ebccf7c4bcce5c1ac21ae27de9f01180ae9" + integrity sha512-3itoDFbKUNx1eKmVpYMFyqKX04Ww9osZ+dLgrk6GEv6KMVeXUhUnp4I5X+evw+u3ZxVU6RFXSSRxlTeMh8bA+g== + data-uri-to-buffer@^5.0.1: version "5.0.1" resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-5.0.1.tgz#db89a9e279c2ffe74f50637a59a32fb23b3e4d7c"