Gracefully handle non-absolute path for create-login-profile --filename (#521)

Fixes #513 

If the value passed to the `create-login-profile` entrypoint's
`--filename` option is not an absolute path, resolve it relative to
`/crawls/profiles`.

Also updates the docs cli-options page to include the
`create-login-profile` entrypoint, and adjusts the script that
generates this page so it emits a section for each entrypoint.

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
Tessa Walsh 2024-03-29 16:46:54 -04:00 committed by GitHub
parent 5152169916
commit 1325cc3868
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 79 additions and 14 deletions

2
.gitignore vendored
View file

@ -8,3 +8,5 @@ test-crawls/
.DS_Store .DS_Store
dist dist
scratch/ scratch/
venv/
docs/venv/

View file

@ -1,10 +1,10 @@
# All Command-Line Options # All Command-Line Options
The Browsertrix Crawler Docker image currently accepts the following parameters: The Browsertrix Crawler Docker image currently accepts the following parameters, broken down by entrypoint:
## crawler
``` ```
crawler [options]
Options: Options:
--help Show help [boolean] --help Show help [boolean]
--version Show version number [boolean] --version Show version number [boolean]
@ -94,14 +94,15 @@ Options:
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l
inks", "sitemap"] [default: []] inks", "sitemap", "replay"] [default: []]
--logExcludeContext Comma-separated list of contexts to --logExcludeContext Comma-separated list of contexts to
NOT include in logs NOT include in logs
[array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer" [array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l
inks", "sitemap"] [default: ["recorderNetwork","jsError","screencast"]] inks", "sitemap", "replay"] [default: ["recorderNetwork","jsError","screencast
"]]
--text Extract initial (default) or final t --text Extract initial (default) or final t
ext to pages.jsonl or WARC resource ext to pages.jsonl or WARC resource
record(s) record(s)
@ -123,9 +124,15 @@ Options:
itemap.xml, or custom URL if URL is itemap.xml, or custom URL if URL is
specified specified
--sitemapFromDate, --sitemapFrom If set, filter URLs from sitemaps to --sitemapFromDate, --sitemapFrom If set, filter URLs from sitemaps to
those greater than or equal to prov those greater than or equal to (>=)
ided ISO Date string (YYYY-MM-DD or provided ISO Date string (YYYY-MM-D
YYYY-MM-DDTHH:MM:SS or partial date) D or YYYY-MM-DDTHH:MM:SS or partial
date)
--sitemapToDate, --sitemapTo If set, filter URLs from sitemaps to
those less than or equal to (<=) pr
ovided ISO Date string (YYYY-MM-DD o
r YYYY-MM-DDTHH:MM:SS or partial dat
e)
--statsFilename If set, output stats as JSON to this --statsFilename If set, output stats as JSON to this
file. (Relative filename resolves t file. (Relative filename resolves t
o crawl working directory) o crawl working directory)
@ -239,5 +246,47 @@ Options:
ess (for debugging) [boolean] ess (for debugging) [boolean]
--warcPrefix prefix for WARC files generated, inc --warcPrefix prefix for WARC files generated, inc
luding WARCs added to WACZ [string] luding WARCs added to WACZ [string]
--serviceWorker, --sw service worker handling: disabled, e
nabled, or disabled with custom prof
ile
[choices: "disabled", "disabled-if-profile", "enabled"] [default: "disabled"]
--qaSource Required for QA mode. Source (WACZ o
r multi WACZ) for QA [string]
--qaDebugImageDiff if specified, will write crawl.png,
replay.png and diff.png for each pag
e where they're different [boolean]
--config Path to YAML config file --config Path to YAML config file
``` ```
## create-login-profile
```
Options:
--help Show help [boolean]
--version Show version number [boolean]
--url The URL of the login page [string] [required]
--user The username for the login. If not specified, will be promp
ted
--password The password for the login. If not specified, will be promp
ted (recommended)
--filename The filename for the profile tarball
[default: "/crawls/profiles/profile.tar.gz"]
--debugScreenshot If specified, take a screenshot after login and save as thi
s filename
--headless Run in headless mode, otherwise start xvfb
[boolean] [default: false]
--automated Start in automated mode, no interactive browser
[boolean] [default: false]
--interactive Deprecated. Now the default option!
[boolean] [default: false]
--shutdownWait Shutdown browser in interactive after this many seconds, if
no pings received [number] [default: 0]
--profile Path to tar.gz file which will be extracted and used as the
browser profile [string]
--windowSize Browser window dimensions, specified as: width,height
[string] [default: "1360,1020"]
--proxy [boolean] [default: false]
--cookieDays If >0, set all cookies, including session cookies, to have
this duration in days before saving profile
[number] [default: 7]
```

View file

@ -4,11 +4,17 @@ CURR=$(dirname "${BASH_SOURCE[0]}")
out=$CURR/docs/user-guide/cli-options.md out=$CURR/docs/user-guide/cli-options.md
echo "# All Command-Line Options" > $out echo "# All Command-Line Options" > $out
echo "" >> $out echo "" >> $out
echo "The Browsertrix Crawler Docker image currently accepts the following parameters:" >> $out echo "The Browsertrix Crawler Docker image currently accepts the following parameters, broken down by entrypoint:" >> $out
echo "" >> $out
echo "## crawler" >> $out
echo "" >> $out echo "" >> $out
echo '```' >> $out echo '```' >> $out
#node $CURR/../dist/main.js --help >> $out #node $CURR/../dist/main.js --help >> $out
docker run webrecorder/browsertrix-crawler crawl --help >> $out docker run webrecorder/browsertrix-crawler crawl --help | tail -n +3 >> $out
echo '```' >> $out
echo "" >> $out
echo "## create-login-profile" >> $out
echo "" >> $out
echo '```' >> $out
docker run webrecorder/browsertrix-crawler create-login-profile --help | tail -n +3 >> $out
echo '```' >> $out echo '```' >> $out

View file

@ -1,6 +1,6 @@
{ {
"name": "browsertrix-crawler", "name": "browsertrix-crawler",
"version": "1.1.0-beta.3", "version": "1.1.0-beta.4",
"main": "browsertrix-crawler", "main": "browsertrix-crawler",
"type": "module", "type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler", "repository": "https://github.com/webrecorder/browsertrix-crawler",

View file

@ -52,7 +52,8 @@ function cliOpts(): { [key: string]: Options } {
}, },
filename: { filename: {
describe: "The filename for the profile tarball", describe:
"The filename for the profile tarball, stored within /crawls/profiles if absolute path not provided",
default: "/crawls/profiles/profile.tar.gz", default: "/crawls/profiles/profile.tar.gz",
}, },
@ -300,6 +301,13 @@ async function createProfile(
logger.info("Creating profile"); logger.info("Creating profile");
if (params.filename && !params.filename.startsWith("/")) {
params.filename = path.resolve("/crawls/profiles/", params.filename);
logger.info(
`Absolute path for filename not provided, saving to ${params.filename}`,
);
}
const profileFilename = params.filename || "/crawls/profiles/profile.tar.gz"; const profileFilename = params.filename || "/crawls/profiles/profile.tar.gz";
const outputDir = path.dirname(profileFilename); const outputDir = path.dirname(profileFilename);