2022-10-24 15:30:10 +02:00
|
|
|
import child_process from "child_process";
|
|
|
|
import fs from "fs";
|
|
|
|
import yaml from "js-yaml";
|
2021-07-19 15:49:43 -07:00
|
|
|
|
|
|
|
/**
 * Runs the browsertrix-crawler Docker image once, passing the crawl
 * configuration to the container as YAML on stdin.
 *
 * The crawl is always run with `generateCDX: true`, `depth: 0` and
 * `collection: name`; these values override same-named keys in `config`.
 * The caller's `config` object is left untouched (a shallow copy is
 * serialized instead).
 *
 * A failing docker command is logged and NOT rethrown, so callers go on to
 * inspect whatever output the crawl produced.
 *
 * @param {string} name - Value used for the crawl's `collection` setting.
 * @param {object} config - Crawler options to serialize as YAML.
 * @param {string} [commandExtra=""] - Extra command-line text appended
 *   verbatim to the shell command; must come from trusted test code only.
 * @returns {void}
 */
function runCrawl(name, config, commandExtra = "") {
  // Copy instead of assigning onto `config`, so a config object reused by
  // the caller is not silently modified.
  const crawlConfig = {
    ...config,
    generateCDX: true,
    depth: 0,
    collection: name,
  };

  const configYaml = yaml.dump(crawlConfig);

  try {
    // `input` is piped to the child's stdin. The former `stdin: "inherit"`
    // key was not a recognized execSync option (the real one is `stdio`,
    // and `input` overrides stdio[0] anyway), so it has been dropped.
    const output = child_process.execSync(
      `docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`,
      { input: configYaml, encoding: "utf8" }
    );

    console.log(output);
  } catch (error) {
    // Best-effort: report the failure and let the calling test's
    // assertions decide the outcome.
    console.log(error);
  }
}
|
|
|
|
|
|
|
|
/**
 * Checks whether a collection's CDXJ index file contains the given text.
 *
 * Reads `test-crawls/collections/<coll>/indexes/index.cdxj` relative to the
 * current working directory; throws if the file cannot be read.
 *
 * @param {string} coll - Collection directory name.
 * @param {string} value - Text to look for in the raw index contents.
 * @returns {boolean} `true` when `value` occurs anywhere in the index.
 */
function doesCDXContain(coll, value) {
  const indexPath = `test-crawls/collections/${coll}/indexes/index.cdxj`;
  const indexContents = fs.readFileSync(indexPath);

  return indexContents.includes(value);
}
|
|
|
|
|
2023-10-09 09:41:50 -07:00
|
|
|
// Test disabled for Brave -- should always be blocked, but seeing inconsistent CI behavior
|
|
|
|
/*
|
2021-07-19 15:49:43 -07:00
|
|
|
test("test crawl without block for specific URL", () => {
|
|
|
|
const config = {
|
|
|
|
"url": "https://www.iana.org/",
|
2023-10-02 14:30:44 -07:00
|
|
|
"pageExtraDelay": 10
|
2021-07-19 15:49:43 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
runCrawl("block-1-no-block", config);
|
|
|
|
|
|
|
|
// without blocks, URL with adsense is included
|
|
|
|
expect(doesCDXContain("block-1-no-block", "https://cse.google.com/adsense/search/async-ads.js")).toBe(true);
|
|
|
|
});
|
2023-10-09 09:41:50 -07:00
|
|
|
*/
|
2021-07-19 15:49:43 -07:00
|
|
|
|
|
|
|
|
|
|
|
// Crawls iana.org with a single block rule whose url pattern is "adsense",
// then expects the adsense script URL to be absent from the CDX index.
test("test block rule on specific URL", () => {
  const collection = "block-1";
  const blockRules = [
    { url: "adsense" },
  ];

  runCrawl(collection, {
    url: "https://www.iana.org/",
    blockRules,
  });

  const adsCaptured = doesCDXContain(collection, "https://cse.google.com/adsense/search/async-ads.js");
  expect(adsCaptured).toBe(false);
});
|
|
|
|
|
|
|
|
// allowOnly rule on the YouTube embed frame with a frameTextMatch pattern
// for channel id ..._Q: the index is expected to contain a video/mp4 entry.
test("test block rule based on iframe text, content included due to match", () => {
  const collection = "block-2";

  const embedRule = {
    url: "https://www.youtube.com/embed/",
    frameTextMatch: "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"",
    type: "allowOnly",
  };

  runCrawl(collection, {
    url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
    blockRules: [embedRule],
  });

  const hasVideo = doesCDXContain(collection, "\"video/mp4\"");
  expect(hasVideo).toBe(true);
});
|
|
|
|
|
|
|
|
|
|
|
|
// Same allowOnly rule shape as the matching case, but the frameTextMatch
// pattern names a different channel id (..._R): no video/mp4 entry expected.
test("test block rule based on iframe text, wrong text, content should be excluded", () => {
  const collection = "block-3";

  const embedRule = {
    url: "https://www.youtube.com/embed/",
    frameTextMatch: "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_R\\\\\"",
    type: "allowOnly",
  };

  runCrawl(collection, {
    url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
    blockRules: [embedRule],
  });

  const hasVideo = doesCDXContain(collection, "\"video/mp4\"");
  expect(hasVideo).toBe(false);
});
|
|
|
|
|
|
|
|
|
|
|
|
// Rule without an explicit "type" on the YouTube embed frame, with a
// frameTextMatch pattern for channel id ..._Q: no video/mp4 entry expected.
test("test block rule based on iframe text, block matched", () => {
  const collection = "block-4";

  const embedRule = {
    url: "https://www.youtube.com/embed/",
    frameTextMatch: "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"",
  };

  runCrawl(collection, {
    url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
    blockRules: [embedRule],
  });

  const hasVideo = doesCDXContain(collection, "\"video/mp4\"");
  expect(hasVideo).toBe(false);
});
|
|
|
|
|
|
|
|
// Two rules: a "block" rule for example.com/embed/ carrying a frameTextMatch
// pattern, followed by an "allowOnly" rule for youtube.com / example.com
// embeds scoped via inFrameUrl to oembed.link. A video/mp4 entry is expected.
test("test rule based on iframe text not matching, plus allowOnly iframe", () => {
  const collection = "non-block-5";

  const textBlockRule = {
    url: "example.com/embed/",
    frameTextMatch: "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"",
    type: "block",
  };

  const allowEmbedsRule = {
    url: "(youtube.com|example.com)/embed/",
    type: "allowOnly",
    inFrameUrl: "oembed.link/",
  };

  runCrawl(collection, {
    url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
    blockRules: [textBlockRule, allowEmbedsRule],
  });

  const hasVideo = doesCDXContain(collection, "\"video/mp4\"");
  expect(hasVideo).toBe(true);
});
|
|
|
|
|
|
|
|
// "block" rule for maxresdefault.jpg scoped via inFrameUrl to
// youtube.com/embed: the thumbnail URL is expected to be absent from the index.
test("test block url in frame url", () => {
  const collection = "block-6";

  const thumbnailRule = {
    url: "maxresdefault.jpg",
    type: "block",
    inFrameUrl: "youtube.com/embed",
  };

  runCrawl(collection, {
    url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
    blockRules: [thumbnailRule],
  });

  const thumbnailCaptured = doesCDXContain(collection, "\"https://i.ytimg.com/vi/aT-Up5Y4uRI/maxresdefault.jpg\"");
  expect(thumbnailCaptured).toBe(false);
});
|
|
|
|
|
|
|
|
|
2021-07-27 09:41:21 -07:00
|
|
|
// Combines three rules on a single seed page:
//   1. allowOnly archiveweb.page / www.youtube.com URLs, scoped via
//      inFrameUrl to archiveweb.page,
//   2. a rule (no explicit type) for lunr.min.js, scoped the same way,
//   3. allowOnly the YouTube embed, with a frameTextMatch channel-id pattern.
// Expects lunr.min.js to be absent and a video/mp4 entry to be present.
test("test block rules complex example, block external urls on main frame, but not on youtube", () => {
  const collection = "block-7";

  const mainFrameAllowRule = {
    url: "(archiveweb.page|www.youtube.com)",
    type: "allowOnly",
    inFrameUrl: "archiveweb.page",
  };

  const lunrRule = {
    url: "https://archiveweb.page/assets/js/vendor/lunr.min.js",
    inFrameUrl: "archiveweb.page",
  };

  const youtubeEmbedRule = {
    url: "https://www.youtube.com/embed/",
    type: "allowOnly",
    frameTextMatch: "(\\\\\"channelId\\\\\":\\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\\")",
  };

  runCrawl(collection, {
    seeds: [
      "https://archiveweb.page/en/troubleshooting/errors/",
    ],
    depth: "0",
    blockRules: [mainFrameAllowRule, lunrRule, youtubeEmbedRule],
    combineWARC: true,
    logging: "stats,debug",
  });

  const lunrCaptured = doesCDXContain(collection, "\"https://archiveweb.page/assets/js/vendor/lunr.min.js\"");
  expect(lunrCaptured).toBe(false);

  const hasVideo = doesCDXContain(collection, "\"video/mp4\"");
  expect(hasVideo).toBe(true);
});
|
|
|
|
|
|
|
|
|
2021-07-19 15:49:43 -07:00
|
|
|
|