Gracefully handle non-absolute path for create-login-profile --filename (#521)

Fixes #513 

If an absolute path isn't provided to the `create-login-profile`
entrypoint's `--filename` option, resolve the given value relative to
`/crawls/profiles`.

Also updates the docs cli-options section to include the
`create-login-profile` entrypoint and adjusts the script that
automatically generates this page accordingly.

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
Tessa Walsh 2024-03-29 16:46:54 -04:00 committed by GitHub
parent 5152169916
commit 1325cc3868
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 79 additions and 14 deletions

2
.gitignore vendored
View file

@ -8,3 +8,5 @@ test-crawls/
.DS_Store
dist
scratch/
venv/
docs/venv/

View file

@ -1,10 +1,10 @@
# All Command-Line Options
The Browsertrix Crawler Docker image currently accepts the following parameters:
The Browsertrix Crawler Docker image currently accepts the following parameters, broken down by entrypoint:
## crawler
```
crawler [options]
Options:
--help Show help [boolean]
--version Show version number [boolean]
@ -94,14 +94,15 @@ Options:
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l
inks", "sitemap"] [default: []]
inks", "sitemap", "replay"] [default: []]
--logExcludeContext Comma-separated list of contexts to
NOT include in logs
[array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l
inks", "sitemap"] [default: ["recorderNetwork","jsError","screencast"]]
inks", "sitemap", "replay"] [default: ["recorderNetwork","jsError","screencast
"]]
--text Extract initial (default) or final t
ext to pages.jsonl or WARC resource
record(s)
@ -123,9 +124,15 @@ Options:
itemap.xml, or custom URL if URL is
specified
--sitemapFromDate, --sitemapFrom If set, filter URLs from sitemaps to
those greater than or equal to prov
ided ISO Date string (YYYY-MM-DD or
YYYY-MM-DDTHH:MM:SS or partial date)
those greater than or equal to (>=)
provided ISO Date string (YYYY-MM-D
D or YYYY-MM-DDTHH:MM:SS or partial
date)
--sitemapToDate, --sitemapTo If set, filter URLs from sitemaps to
those less than or equal to (<=) pr
ovided ISO Date string (YYYY-MM-DD o
r YYYY-MM-DDTHH:MM:SS or partial dat
e)
--statsFilename If set, output stats as JSON to this
file. (Relative filename resolves t
o crawl working directory)
@ -239,5 +246,47 @@ Options:
ess (for debugging) [boolean]
--warcPrefix prefix for WARC files generated, inc
luding WARCs added to WACZ [string]
--serviceWorker, --sw service worker handling: disabled, e
nabled, or disabled with custom prof
ile
[choices: "disabled", "disabled-if-profile", "enabled"] [default: "disabled"]
--qaSource Required for QA mode. Source (WACZ o
r multi WACZ) for QA [string]
--qaDebugImageDiff if specified, will write crawl.png,
replay.png and diff.png for each pag
e where they're different [boolean]
--config Path to YAML config file
```
## create-login-profile
```
Options:
--help Show help [boolean]
--version Show version number [boolean]
--url The URL of the login page [string] [required]
--user The username for the login. If not specified, will be promp
ted
--password The password for the login. If not specified, will be promp
ted (recommended)
--filename The filename for the profile tarball
[default: "/crawls/profiles/profile.tar.gz"]
--debugScreenshot If specified, take a screenshot after login and save as thi
s filename
--headless Run in headless mode, otherwise start xvfb
[boolean] [default: false]
--automated Start in automated mode, no interactive browser
[boolean] [default: false]
--interactive Deprecated. Now the default option!
[boolean] [default: false]
--shutdownWait Shutdown browser in interactive after this many seconds, if
no pings received [number] [default: 0]
--profile Path to tar.gz file which will be extracted and used as the
browser profile [string]
--windowSize Browser window dimensions, specified as: width,height
[string] [default: "1360,1020"]
--proxy [boolean] [default: false]
--cookieDays If >0, set all cookies, including session cookies, to have
this duration in days before saving profile
[number] [default: 7]
```

View file

@ -4,11 +4,17 @@ CURR=$(dirname "${BASH_SOURCE[0]}")
out=$CURR/docs/user-guide/cli-options.md
echo "# All Command-Line Options" > $out
echo "" >> $out
echo "The Browsertrix Crawler Docker image currently accepts the following parameters:" >> $out
echo "The Browsertrix Crawler Docker image currently accepts the following parameters, broken down by entrypoint:" >> $out
echo "" >> $out
echo "## crawler" >> $out
echo "" >> $out
echo '```' >> $out
#node $CURR/../dist/main.js --help >> $out
docker run webrecorder/browsertrix-crawler crawl --help >> $out
docker run webrecorder/browsertrix-crawler crawl --help | tail -n +3 >> $out
echo '```' >> $out
echo "" >> $out
echo "## create-login-profile" >> $out
echo "" >> $out
echo '```' >> $out
docker run webrecorder/browsertrix-crawler create-login-profile --help | tail -n +3 >> $out
echo '```' >> $out

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.1.0-beta.3",
"version": "1.1.0-beta.4",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",

View file

@ -52,7 +52,8 @@ function cliOpts(): { [key: string]: Options } {
},
filename: {
describe: "The filename for the profile tarball",
describe:
"The filename for the profile tarball, stored within /crawls/profiles if absolute path not provided",
default: "/crawls/profiles/profile.tar.gz",
},
@ -300,6 +301,13 @@ async function createProfile(
logger.info("Creating profile");
if (params.filename && !params.filename.startsWith("/")) {
params.filename = path.resolve("/crawls/profiles/", params.filename);
logger.info(
`Absolute path for filename not provided, saving to ${params.filename}`,
);
}
const profileFilename = params.filename || "/crawls/profiles/profile.tar.gz";
const outputDir = path.dirname(profileFilename);