diff --git a/package.json b/package.json
index b7f6bed5..9f7a31f8 100644
--- a/package.json
+++ b/package.json
@@ -26,6 +26,7 @@
     "get-folder-size": "^4.0.0",
     "husky": "^8.0.3",
     "ioredis": "^5.3.2",
+    "iso-639-1": "^3.1.5",
     "js-levenshtein": "^1.1.6",
     "js-yaml": "^4.1.0",
     "minio": "^7.1.3",
diff --git a/src/crawler.ts b/src/crawler.ts
index 2b46b6b9..0311481d 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -592,7 +592,14 @@ export class Crawler {
   extraChromeArgs() {
     const args = [];
     if (this.params.lang) {
-      args.push(`--accept-lang=${this.params.lang}`);
+      if (this.params.profile) {
+        logger.warn(
+          "Ignoring --lang option with profile, using language configured in the profile",
+          { lang: this.params.lang },
+        );
+      } else {
+        args.push(`--accept-lang=${this.params.lang}`);
+      }
     }
     return args;
   }
diff --git a/src/util/argParser.ts b/src/util/argParser.ts
index eed6d935..c0564ec9 100644
--- a/src/util/argParser.ts
+++ b/src/util/argParser.ts
@@ -1,5 +1,6 @@
 import path from "path";
 import fs from "fs";
+import ISO6391 from "iso-639-1";
 
 import yaml from "js-yaml";
 import { KnownDevices as devices } from "puppeteer-core";
@@ -770,6 +771,12 @@ class ArgParser {
       argv.emulateDevice = { viewport: null };
     }
 
+    if (argv.lang) {
+      if (!ISO6391.validate(argv.lang)) {
+        logger.fatal("Invalid ISO-639-1 language code for --lang: " + argv.lang);
+      }
+    }
+
     if (argv.seedFile) {
       const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");
       const urlSeedFileList = urlSeedFile.split("\n");
diff --git a/tests/lang-code.test.js b/tests/lang-code.test.js
new file mode 100644
index 00000000..a99387fe
--- /dev/null
+++ b/tests/lang-code.test.js
@@ -0,0 +1,23 @@
+import { execSync } from "child_process";
+
+test("run crawl with invalid lang", () => {
+  let status = 0;
+  try {
+    execSync(`docker run --rm webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/feed.xml --lang e --limit 1`);
+  } catch (e) {
+    status = e.status;
+  }
+  expect(status).toBe(17);
+});
+
+test("run crawl with valid lang", () => {
+  let status = 0;
+  try {
+    execSync(`docker run --rm webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/feed.xml --lang en --limit 1`);
+  } catch (e) {
+    status = e.status;
+  }
+  expect(status).toBe(0);
+});
+
+
diff --git a/yarn.lock b/yarn.lock
index 241134f0..014c7691 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -3269,6 +3269,11 @@ isexe@^2.0.0:
   resolved "https://registry.yarnpkg.com/isexe/-/isexe-2.0.0.tgz#e8fbf374dc556ff8947a10dcb0572d633f2cfa10"
   integrity sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==
 
+iso-639-1@^3.1.5:
+  version "3.1.5"
+  resolved "https://registry.yarnpkg.com/iso-639-1/-/iso-639-1-3.1.5.tgz#e8205aceeeea0f64d6b12f5fac6a943b0d5b452c"
+  integrity sha512-gXkz5+KN7HrG0Q5UGqSMO2qB9AsbEeyLP54kF1YrMsIxmu+g4BdB7rflReZTSTZGpfj8wywu6pfPBCylPIzGQA==
+
 istanbul-lib-coverage@^3.0.0, istanbul-lib-coverage@^3.2.0:
   version "3.2.2"
   resolved "https://registry.yarnpkg.com/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz#2d166c4b0644d43a39f04bf6c2edd1e585f31756"