mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Validate Autoclick selector, fail crawl if invalid (#800)
Fixes #798. Also modifies the existing link selector validation test to check for exit status code 17 when link selectors fail validation. --------- Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
This commit is contained in:
parent
47d61a6baf
commit
8f581a587c
2 changed files with 47 additions and 5 deletions
|
@ -700,6 +700,9 @@ class ArgParser {
|
||||||
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname();
|
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname();
|
||||||
argv.collection = interpolateFilename(argv.collection, argv.crawlId);
|
argv.collection = interpolateFilename(argv.collection, argv.crawlId);
|
||||||
|
|
||||||
|
// css selector parser
|
||||||
|
const parser = createParser();
|
||||||
|
|
||||||
// Check that the collection name is valid.
|
// Check that the collection name is valid.
|
||||||
if (argv.collection.search(/^[\w][\w-]*$/) === -1) {
|
if (argv.collection.search(/^[\w][\w-]*$/) === -1) {
|
||||||
logger.fatal(
|
logger.fatal(
|
||||||
|
@ -710,6 +713,16 @@ class ArgParser {
|
||||||
// background behaviors to apply
|
// background behaviors to apply
|
||||||
const behaviorOpts: { [key: string]: string | boolean } = {};
|
const behaviorOpts: { [key: string]: string | boolean } = {};
|
||||||
if (argv.behaviors.length > 0) {
|
if (argv.behaviors.length > 0) {
|
||||||
|
if (argv.clickSelector) {
|
||||||
|
try {
|
||||||
|
parser(argv.clickSelector);
|
||||||
|
} catch (e) {
|
||||||
|
logger.fatal("Invalid Autoclick CSS Selector", {
|
||||||
|
selector: argv.clickSelector,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
argv.behaviors.forEach((x: string) => {
|
argv.behaviors.forEach((x: string) => {
|
||||||
if (BEHAVIOR_TYPES.includes(x)) {
|
if (BEHAVIOR_TYPES.includes(x)) {
|
||||||
behaviorOpts[x] = true;
|
behaviorOpts[x] = true;
|
||||||
|
@ -761,8 +774,6 @@ class ArgParser {
|
||||||
|
|
||||||
let selectLinks: ExtractSelector[];
|
let selectLinks: ExtractSelector[];
|
||||||
|
|
||||||
const parser = createParser();
|
|
||||||
|
|
||||||
if (argv.selectLinks) {
|
if (argv.selectLinks) {
|
||||||
selectLinks = argv.selectLinks.map((x: string) => {
|
selectLinks = argv.selectLinks.map((x: string) => {
|
||||||
const parts = x.split("->");
|
const parts = x.split("->");
|
||||||
|
|
|
@ -53,16 +53,47 @@ test("test custom selector crawls JS files as pages", async () => {
|
||||||
|
|
||||||
|
|
||||||
test("test invalid selector, crawl fails", async () => {
|
test("test invalid selector, crawl fails", async () => {
|
||||||
let failed = false;
|
let status = 0;
|
||||||
try {
|
try {
|
||||||
child_process.execSync(
|
child_process.execSync(
|
||||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-invalid --selectLinks \"script[\"",
|
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-invalid --selectLinks \"script[\"",
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (e) {
|
||||||
|
status = e.status;
|
||||||
|
}
|
||||||
|
|
||||||
|
// logger fatal exit code
|
||||||
|
expect(status).toBe(17);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("test valid autoclick selector passes validation", async () => {
|
||||||
|
let failed = false;
|
||||||
|
|
||||||
|
try {
|
||||||
|
child_process.execSync(
|
||||||
|
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --clickSelector button --scopeType page",
|
||||||
|
);
|
||||||
|
} catch (e) {
|
||||||
failed = true;
|
failed = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
expect(failed).toBe(true);
|
// valid clickSelector
|
||||||
|
expect(failed).toBe(false);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
test("test invalid autoclick selector fails validation, crawl fails", async () => {
|
||||||
|
let status = 0;
|
||||||
|
|
||||||
|
try {
|
||||||
|
child_process.execSync(
|
||||||
|
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --clickSelector \",\" --scopeType page",
|
||||||
|
);
|
||||||
|
} catch (e) {
|
||||||
|
status = e.status;
|
||||||
|
}
|
||||||
|
|
||||||
|
// logger fatal exit code
|
||||||
|
expect(status).toBe(17);
|
||||||
|
});
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue