lang code fixes: (#834)

- validate --lang values, fail immediately on an invalid ISO 639-1
language code
- ignore --lang value when using profile, print warning that profile
language takes precedence
- fixes #833
This commit is contained in:
Ilya Kreymer 2025-05-12 16:06:29 -07:00 committed by GitHub
parent e39d5a31eb
commit 71de8d6582
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 44 additions and 1 deletions

View file

@ -26,6 +26,7 @@
"get-folder-size": "^4.0.0",
"husky": "^8.0.3",
"ioredis": "^5.3.2",
"iso-639-1": "^3.1.5",
"js-levenshtein": "^1.1.6",
"js-yaml": "^4.1.0",
"minio": "^7.1.3",

View file

@ -592,7 +592,14 @@ export class Crawler {
extraChromeArgs() {
const args = [];
if (this.params.lang) {
args.push(`--accept-lang=${this.params.lang}`);
if (this.params.profile) {
logger.warn(
"Ignoring --lang option with profile, using language configured in the profile",
{ lang: this.params.lang },
);
} else {
args.push(`--accept-lang=${this.params.lang}`);
}
}
return args;
}

View file

@ -1,5 +1,6 @@
import path from "path";
import fs from "fs";
import ISO6391 from "iso-639-1";
import yaml from "js-yaml";
import { KnownDevices as devices } from "puppeteer-core";
@ -770,6 +771,12 @@ class ArgParser {
argv.emulateDevice = { viewport: null };
}
// Reject a --lang value that ISO6391.validate() does not accept.
// ISO 639-1 defines language codes (e.g. "en"), not country codes.
// NOTE(review): logger.fatal presumably aborts the process here — confirm.
if (argv.lang && !ISO6391.validate(argv.lang)) {
  logger.fatal(`Invalid ISO-639-1 language code for --lang: ${argv.lang}`);
}
if (argv.seedFile) {
const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");
const urlSeedFileList = urlSeedFile.split("\n");

23
tests/lang-code.test.js Normal file
View file

@ -0,0 +1,23 @@
import { execSync } from "child_process";
// A one-letter --lang value ("e") is expected to make the crawl exit with status 17.
test("run crawl with invalid lang", () => {
  const command =
    "docker run --rm webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/feed.xml --lang e --limit 1";

  // execSync throws on a non-zero exit; a clean run is recorded as 0.
  const exitCode = (() => {
    try {
      execSync(command);
      return 0;
    } catch (err) {
      return err.status;
    }
  })();

  expect(exitCode).toBe(17);
});
// A two-letter --lang value ("en") is expected to let the crawl finish with status 0.
test("run crawl with valid lang", () => {
  const command =
    "docker run --rm webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/feed.xml --lang en --limit 1";

  // execSync throws on a non-zero exit; a clean run is recorded as 0.
  const exitCode = (() => {
    try {
      execSync(command);
      return 0;
    } catch (err) {
      return err.status;
    }
  })();

  expect(exitCode).toBe(0);
});

View file

@ -3269,6 +3269,11 @@ isexe@^2.0.0:
resolved "https://registry.yarnpkg.com/isexe/-/isexe-2.0.0.tgz#e8fbf374dc556ff8947a10dcb0572d633f2cfa10"
integrity sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==
iso-639-1@^3.1.5:
version "3.1.5"
resolved "https://registry.yarnpkg.com/iso-639-1/-/iso-639-1-3.1.5.tgz#e8205aceeeea0f64d6b12f5fac6a943b0d5b452c"
integrity sha512-gXkz5+KN7HrG0Q5UGqSMO2qB9AsbEeyLP54kF1YrMsIxmu+g4BdB7rflReZTSTZGpfj8wywu6pfPBCylPIzGQA==
istanbul-lib-coverage@^3.0.0, istanbul-lib-coverage@^3.2.0:
version "3.2.2"
resolved "https://registry.yarnpkg.com/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz#2d166c4b0644d43a39f04bf6c2edd1e585f31756"