Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 14:33:17 +00:00)
Gracefully handle non-absolute path for create-login-profile --filename (#521)
Fixes #513. If an absolute path isn't provided to the `create-login-profile` entrypoint's `--filename` option, resolve the given value within `/crawls/profiles`. Also updates the docs cli-options section to include the `create-login-profile` entrypoint and adjusts the script to automatically generate this page accordingly.
---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
parent
5152169916
commit
1325cc3868
5 changed files with 79 additions and 14 deletions
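In practice, a bare filename passed to `--filename` now lands under the profiles directory. A minimal usage sketch (the volume mount, URL, and profile name below are illustrative, not taken from this commit):

```
# With this change, a relative --filename is resolved under /crawls/profiles
# inside the container rather than being used as-is.
docker run -it -v "$PWD/crawls:/crawls" webrecorder/browsertrix-crawler \
  create-login-profile \
  --url "https://example.com/login" \
  --filename myprofile.tar.gz
# The profile tarball is written to /crawls/profiles/myprofile.tar.gz
# (./crawls/profiles/myprofile.tar.gz on the host, via the mount above).

# An absolute path is used unchanged, as before:
#   --filename /crawls/profiles/custom-name.tar.gz
```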
.gitignore (vendored): 2 additions

@@ -8,3 +8,5 @@ test-crawls/
 .DS_Store
 dist
 scratch/
+venv/
+docs/venv/
@@ -1,10 +1,10 @@
 # All Command-Line Options
 
-The Browsertrix Crawler Docker image currently accepts the following parameters:
+The Browsertrix Crawler Docker image currently accepts the following parameters, broken down by entrypoint:
 
+## crawler
+
 ```
-crawler [options]
-
 Options:
   --help                                Show help                   [boolean]
   --version                             Show version number         [boolean]
@@ -94,14 +94,15 @@ Options:
 , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
 ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
 orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l
-inks", "sitemap"] [default: []]
+inks", "sitemap", "replay"] [default: []]
   --logExcludeContext                  Comma-separated list of contexts to
                                        NOT include in logs
 [array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
 , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
 ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
 orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l
-inks", "sitemap"] [default: ["recorderNetwork","jsError","screencast"]]
+inks", "sitemap", "replay"] [default: ["recorderNetwork","jsError","screencast
+"]]
   --text                               Extract initial (default) or final t
                                        ext to pages.jsonl or WARC resource
                                        record(s)
@@ -123,9 +124,15 @@ Options:
                                        itemap.xml, or custom URL if URL is
                                        specified
   --sitemapFromDate, --sitemapFrom     If set, filter URLs from sitemaps to
-                                       those greater than or equal to prov
-                                       ided ISO Date string (YYYY-MM-DD or
-                                        YYYY-MM-DDTHH:MM:SS or partial date)
+                                       those greater than or equal to (>=)
+                                       provided ISO Date string (YYYY-MM-D
+                                       D or YYYY-MM-DDTHH:MM:SS or partial
+                                        date)
+  --sitemapToDate, --sitemapTo         If set, filter URLs from sitemaps to
+                                       those less than or equal to (<=) pr
+                                       ovided ISO Date string (YYYY-MM-DD o
+                                       r YYYY-MM-DDTHH:MM:SS or partial dat
+                                       e)
   --statsFilename                      If set, output stats as JSON to this
                                        file. (Relative filename resolves t
                                        o crawl working directory)
@@ -239,5 +246,47 @@ Options:
                                        ess (for debugging)          [boolean]
   --warcPrefix                         prefix for WARC files generated, inc
                                        luding WARCs added to WACZ   [string]
+  --serviceWorker, --sw                service worker handling: disabled, e
+                                       nabled, or disabled with custom prof
+                                       ile
+  [choices: "disabled", "disabled-if-profile", "enabled"] [default: "disabled"]
+  --qaSource                           Required for QA mode. Source (WACZ o
+                                       r multi WACZ) for QA          [string]
+  --qaDebugImageDiff                   if specified, will write crawl.png,
+                                       replay.png and diff.png for each pag
+                                       e where they're different    [boolean]
   --config                             Path to YAML config file
 ```
+
+## create-login-profile
+
+```
+Options:
+  --help              Show help                                      [boolean]
+  --version           Show version number                            [boolean]
+  --url               The URL of the login page           [string] [required]
+  --user              The username for the login. If not specified, will be promp
+                      ted
+  --password          The password for the login. If not specified, will be promp
+                      ted (recommended)
+  --filename          The filename for the profile tarball
+                      [default: "/crawls/profiles/profile.tar.gz"]
+  --debugScreenshot   If specified, take a screenshot after login and save as thi
+                      s filename
+  --headless          Run in headless mode, otherwise start xvfb
+                      [boolean] [default: false]
+  --automated         Start in automated mode, no interactive browser
+                      [boolean] [default: false]
+  --interactive       Deprecated. Now the default option!
+                      [boolean] [default: false]
+  --shutdownWait      Shutdown browser in interactive after this many seconds, if
+                      no pings received                   [number] [default: 0]
+  --profile           Path to tar.gz file which will be extracted and used as the
+                      browser profile                                 [string]
+  --windowSize        Browser window dimensions, specified as: width,height
+                      [string] [default: "1360,1020"]
+  --proxy                                          [boolean] [default: false]
+  --cookieDays        If >0, set all cookies, including session cookies, to have
+                      this duration in days before saving profile
+                      [number] [default: 7]
+```
@@ -4,11 +4,17 @@ CURR=$(dirname "${BASH_SOURCE[0]}")
 out=$CURR/docs/user-guide/cli-options.md
 
 echo "# All Command-Line Options" > $out
 echo "" >> $out
-echo "The Browsertrix Crawler Docker image currently accepts the following parameters:" >> $out
+echo "The Browsertrix Crawler Docker image currently accepts the following parameters, broken down by entrypoint:" >> $out
+echo "" >> $out
+echo "## crawler" >> $out
 echo "" >> $out
 echo '```' >> $out
 #node $CURR/../dist/main.js --help >> $out
-docker run webrecorder/browsertrix-crawler crawl --help >> $out
+docker run webrecorder/browsertrix-crawler crawl --help | tail -n +3 >> $out
+echo '```' >> $out
+echo "" >> $out
+echo "## create-login-profile" >> $out
+echo "" >> $out
+echo '```' >> $out
+docker run webrecorder/browsertrix-crawler create-login-profile --help | tail -n +3 >> $out
 echo '```' >> $out
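
A note on the `| tail -n +3` added above: the first two lines of the yargs help output are the bare usage line and a blank line (they appeared verbatim in the old generated page), so starting at line 3 makes the captured text begin at `Options:` and sit cleanly under the new per-entrypoint headings. A quick way to see what gets trimmed, reusing the same image the script calls:

```
# Print only the first three lines of the help output; `tail -n +3`
# starts emitting at the third line, i.e. at "Options:".
docker run webrecorder/browsertrix-crawler crawl --help | head -n 3
```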
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "1.1.0-beta.3",
+  "version": "1.1.0-beta.4",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
@@ -52,7 +52,8 @@ function cliOpts(): { [key: string]: Options } {
     },
 
     filename: {
-      describe: "The filename for the profile tarball",
+      describe:
+        "The filename for the profile tarball, stored within /crawls/profiles if absolute path not provided",
       default: "/crawls/profiles/profile.tar.gz",
     },
 
@@ -300,6 +301,13 @@ async function createProfile(
 
   logger.info("Creating profile");
 
+  if (params.filename && !params.filename.startsWith("/")) {
+    params.filename = path.resolve("/crawls/profiles/", params.filename);
+    logger.info(
+      `Absolute path for filename not provided, saving to ${params.filename}`,
+    );
+  }
+
   const profileFilename = params.filename || "/crawls/profiles/profile.tar.gz";
 
   const outputDir = path.dirname(profileFilename);
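
The new branch leans on Node's `path.resolve` with `/crawls/profiles/` as the base, and the `startsWith("/")` guard keeps already-absolute values away from it. A small illustration of the resolution behavior (plain Node outside the container; the filenames are hypothetical):

```
# A bare filename is joined under the profiles directory:
node -e 'console.log(require("path").resolve("/crawls/profiles/", "myprofile.tar.gz"))'
# -> /crawls/profiles/myprofile.tar.gz

# A relative subpath is preserved under the same base:
node -e 'console.log(require("path").resolve("/crawls/profiles/", "clients/acme.tar.gz"))'
# -> /crawls/profiles/clients/acme.tar.gz
```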