// Module ./src/site/telegram.ts of browsertrix-behaviors, written out as plain
// source instead of an eval() string inside the webpack bundle.

/**
 * XPath expressions used to walk a Telegram channel preview page.
 * `postId` and `linkExternal` are evaluated relative to one message node.
 */
const TELEGRAM_QUERIES = {
  telegramContainer: "//main//section[@class='tgme_channel_history js-message_history']",
  postId: "string(./div[@data-post]/@data-post)",
  linkExternal: "string(.//a[@class='tgme_widget_message_link_preview' and @href]/@href)",
};

/**
 * Walks the messages of a `https://t.me/s/<channel>` page from the last child
 * of the history container backwards, yielding one state update per message
 * and briefly attaching linked .jpg/.png previews to the document.
 */
export class TelegramBehavior {
  static id = "Telegram";

  /**
   * @returns {boolean} true when the current location is a t.me/s/ channel page.
   */
  static isMatch() {
    // The dot is escaped so that only the literal host "t.me" matches.
    return /https:\/\/t\.me\/s\/\w[\w]+/.test(window.location.href);
  }

  /**
   * @returns {{state: {messages: number}}} initial behavior state.
   */
  static init() {
    return {
      state: { messages: 0 },
    };
  }

  /**
   * Sleeps for five wait units, then returns the sibling preceding `child`.
   *
   * @param {object} ctx - behavior context; uses `ctx.Lib.sleep` and `ctx.Lib.waitUnit`.
   * @param {Element|null} child - message node currently being processed.
   * @returns {Promise<Element|null>} the previous sibling, or null when there is none.
   */
  async waitForPrev(ctx, child) {
    if (!child) {
      return null;
    }
    await ctx.Lib.sleep(ctx.Lib.waitUnit * 5);
    return child.previousElementSibling || null;
  }

  /**
   * Iterates messages newest-to-oldest.
   *
   * @param {object} ctx - behavior context providing the `Lib` helpers.
   * @yields whatever `ctx.Lib.getState` returns for each step.
   */
  async *run(ctx) {
    const { getState, scrollIntoView, sleep, waitUnit, xpathNode, xpathString } = ctx.Lib;

    const root = xpathNode(TELEGRAM_QUERIES.telegramContainer);
    if (!root) {
      return;
    }

    let child = root.lastElementChild;
    while (child) {
      scrollIntoView(child);

      const postId = xpathString(TELEGRAM_QUERIES.postId, child) || "unknown";
      const linkUrl = xpathString(TELEGRAM_QUERIES.linkExternal, child);

      // Both suffix checks are guarded: a missing link must not throw.
      const isImageLink =
        typeof linkUrl === "string" && (linkUrl.endsWith(".jpg") || linkUrl.endsWith(".png"));

      if (isImageLink) {
        yield getState(ctx, "Loading External Image: " + linkUrl);
        const image = new Image();
        image.src = linkUrl;
        document.body.appendChild(image);
        try {
          await sleep(waitUnit * 2.5);
        } finally {
          // Always detach the temporary <img>, even if the wait rejects.
          document.body.removeChild(image);
        }
      }

      yield getState(ctx, "Loading Message: " + postId, "messages");
      child = await this.waitForPrev(ctx, child);
    }
  }
}
// Module ./src/site/tiktok.ts of browsertrix-behaviors, written out as plain
// source instead of an eval() string inside the webpack bundle.

/**
 * Selectors used by the TikTok behaviors. `commentButton` is passed to
 * `document.querySelector`; the others are passed to the `ctx.Lib` XPath helpers.
 */
const TIKTOK_QUERIES = {
  commentButton: "button[aria-label^='Read or add comments']",
  commentList: "//div[contains(@class, 'CommentListContainer')]",
  commentItem: "div[contains(@class, 'CommentItemContainer')]",
  viewMoreReplies: ".//p[contains(@class, 'ReplyActionText')]",
  viewMoreThread: ".//p[starts-with(@data-e2e, 'view-more') and string-length(text()) > 0]",
  profileVideoList: "//div[starts-with(@data-e2e, 'user-post-item-list')]",
  profileVideoItem: "div[contains(@class, 'DivItemContainerV2')]",
  backButton: "button[contains(@class, 'StyledCloseIconContainer')]",
  pageLoadWaitUntil: "//*[@role='dialog']",
};

/** Value passed to `sleep` after clicking the "show comments" button. */
const TIKTOK_COMMENT_LOAD_WAIT = 10000;

/** Value passed to `sleep` between two captcha checks. */
const TIKTOK_CAPTCHA_POLL_WAIT = 500;

/** Sentinel `breadth` option meaning "no limit on thread expansion". */
export const BREADTH_ALL = Symbol("BREADTH_ALL");

/** Page-load check shared by the TikTok behaviors. */
export class TikTokSharedBehavior {
  /**
   * Waits for the dialog node, then asserts that an element labelled
   * "Messages" exists, reporting "not_logged_in" otherwise.
   *
   * @param {object} ctx - behavior context providing the `Lib` helpers.
   */
  async awaitPageLoad(ctx) {
    const { assertContentValid, waitUntilNode } = ctx.Lib;
    await waitUntilNode(TIKTOK_QUERIES.pageLoadWaitUntil, document, null, 20000);
    assertContentValid(() => !!document.querySelector("*[aria-label='Messages']"), "not_logged_in");
  }
}

/** Expands and walks the comment threads of a single TikTok video page. */
export class TikTokVideoBehavior extends TikTokSharedBehavior {
  static id = "TikTokVideo";

  /** @returns {{state: {comments: number}, opts: {breadth: symbol}}} */
  static init() {
    return {
      state: { comments: 0 },
      opts: { breadth: BREADTH_ALL },
    };
  }

  /** @returns {boolean} true on `tiktok.com/@<user>/video/<digits>` URLs. */
  static isMatch() {
    const pathRegex = /https:\/\/(www\.)?tiktok\.com\/@.+\/video\/\d+\/?.*/;
    return pathRegex.test(window.location.href);
  }

  /**
   * @param {{opts: {breadth: (symbol|number)}}} ctx - only `opts.breadth` is read.
   * @param {number} iter - number of expansions already performed.
   * @returns {boolean} true when a numeric breadth limit has been reached.
   */
  breadthComplete({ opts: { breadth } }, iter) {
    return breadth !== BREADTH_ALL && breadth <= iter;
  }

  /**
   * Repeatedly clicks the "view more" node under `parentNode` until none is
   * found or the breadth limit is hit.
   *
   * @param {object} ctx - behavior context.
   * @param {Node} parentNode - comment item being expanded.
   * @param {Node|null} [prev=null] - previously clicked node, forwarded to `waitUntilNode`.
   * @param {number} [iter=0] - expansion counter compared against the breadth limit.
   */
  async *crawlThread(ctx, parentNode, prev = null, iter = 0) {
    const { waitUntilNode, scrollAndClick, getState } = ctx.Lib;
    const next = await waitUntilNode(TIKTOK_QUERIES.viewMoreThread, parentNode, prev);
    if (!next || this.breadthComplete(ctx, iter)) {
      return;
    }
    await scrollAndClick(next, 500);
    yield getState(ctx, "View more replies", "comments");
    yield* this.crawlThread(ctx, parentNode, next, iter + 1);
  }

  /**
   * Clicks the reply toggle of one comment, then crawls the opened thread.
   *
   * @param {object} ctx - behavior context.
   * @param {Node} item - comment item node.
   */
  async *expandThread(ctx, item) {
    const { xpathNode, scrollAndClick, getState } = ctx.Lib;
    const viewMore = xpathNode(TIKTOK_QUERIES.viewMoreReplies, item);
    if (!viewMore) {
      return;
    }
    await scrollAndClick(viewMore, 500);
    yield getState(ctx, "View comment", "comments");
    yield* this.crawlThread(ctx, item, null, 1);
  }

  /**
   * Opens the comment panel, walks every comment item and expands its thread.
   * A background poller reports "not_logged_in" if a captcha element appears.
   *
   * @param {object} ctx - behavior context providing the `Lib` helpers.
   */
  async *run(ctx) {
    const { xpathNode, iterChildMatches, scrollIntoView, getState, assertContentValid, sleep } = ctx.Lib;

    const showComments = document.querySelector(TIKTOK_QUERIES.commentButton);
    if (showComments) {
      showComments.click();
      await sleep(TIKTOK_COMMENT_LOAD_WAIT);
    }

    // The poller is tied to this run: without the flag it would keep polling
    // forever, and the profile behavior starts one run per opened video.
    let running = true;
    // Deliberately not awaited; it runs alongside the comment walk below.
    void (async () => {
      while (running) {
        if (document.querySelector("div[class*=captcha]")) {
          assertContentValid(false, "not_logged_in");
          break;
        }
        await sleep(TIKTOK_CAPTCHA_POLL_WAIT);
      }
    })();

    try {
      const commentList = xpathNode(TIKTOK_QUERIES.commentList);
      const commentItems = iterChildMatches(TIKTOK_QUERIES.commentItem, commentList);
      for await (const item of commentItems) {
        scrollIntoView(item);
        yield getState(ctx, "View comment", "comments");
        if (this.breadthComplete(ctx, 0)) {
          continue;
        }
        yield* this.expandThread(ctx, item);
      }
      yield getState(ctx, "TikTok Video Behavior Complete");
    } finally {
      // Runs on normal completion, on errors and on early generator return.
      running = false;
    }
  }
}

/** Opens each video listed on a TikTok profile page and runs the video behavior on it. */
export class TikTokProfileBehavior extends TikTokSharedBehavior {
  static id = "TikTokProfile";

  /**
   * @returns {boolean} true on `tiktok.com/@<handle>` URLs, optionally followed
   *   by a trailing slash and/or a query string.
   */
  static isMatch() {
    // Handles may contain "." and "_" in addition to letters and digits, and a
    // query string may follow the handle with or without a trailing slash.
    const pathRegex = /https:\/\/(www\.)?tiktok\.com\/@[\w.]+\/?(\?.*)?$/;
    return pathRegex.test(window.location.href);
  }

  /** @returns {{state: {videos: number, comments: number}, opts: {breadth: symbol}}} */
  static init() {
    return {
      state: { videos: 0, comments: 0 },
      opts: { breadth: BREADTH_ALL },
    };
  }

  /**
   * Clicks the first link inside `item`; if the history state changed, runs
   * the video behavior and then navigates back via the close button.
   *
   * @param {object} ctx - behavior context.
   * @param {Node} item - profile grid item.
   */
  async *openVideo(ctx, item) {
    const { HistoryState, xpathNode, sleep } = ctx.Lib;
    const link = xpathNode(".//a", item);
    if (!link) {
      return;
    }
    const viewState = new HistoryState(() => link.click());
    await sleep(500);
    if (viewState.changed) {
      const videoBehavior = new TikTokVideoBehavior();
      yield* videoBehavior.run(ctx);
      await sleep(500);
      await viewState.goBack(TIKTOK_QUERIES.backButton);
    }
  }

  /**
   * Walks the profile's video list, opening each item in turn.
   *
   * @param {object} ctx - behavior context providing the `Lib` helpers.
   */
  async *run(ctx) {
    const { xpathNode, iterChildMatches, scrollIntoView, getState, sleep } = ctx.Lib;
    const profileVideoList = xpathNode(TIKTOK_QUERIES.profileVideoList);
    const profileVideos = iterChildMatches(TIKTOK_QUERIES.profileVideoItem, profileVideoList);
    for await (const item of profileVideos) {
      scrollIntoView(item);
      yield getState(ctx, "View video", "videos");
      yield* this.openVideo(ctx, item);
      await sleep(500);
    }
    yield getState(ctx, "TikTok Profile Behavior Complete");
  }
}
// Module ./src/site/twitter.ts of browsertrix-behaviors, written out as plain
// source instead of an eval() string inside the webpack bundle.

/**
 * XPath expressions used by the Twitter/X timeline behavior. Expressions that
 * start with "." are evaluated relative to a timeline entry or tweet node.
 */
const TWITTER_QUERIES = {
  rootPath: "//h1[@role='heading' and @aria-level='1']/following-sibling::div[@aria-label]//div[@style]",
  anchor: ".//article",
  childMatchSelect: "string(.//article//a[starts-with(@href, '/') and @aria-label]/@href)",
  childMatch: "child::div[.//a[@href='$1']]",
  expand: ".//div[@role='button' and not(@aria-haspopup) and not(@data-testid)]",
  quote: ".//div[@role='blockquote' and @aria-haspopup='false']",
  image: ".//a[@role='link' and starts-with(@href, '/') and contains(@href, '/photo/')]",
  imageFirstNext: "//div[@aria-roledescription='carousel']/div[2]/div[1]//div[@role='button']",
  imageNext: "//div[@aria-roledescription='carousel']/div[2]/div[2]//div[@role='button']",
  imageClose: "//div[@role='presentation']/div[@role='button' and @aria-label]",
  backButton: "//div[@data-testid='titleContainer']//div[@role='button']",
  viewSensitive: ".//a[@href='/settings/content_you_see']/parent::div/parent::div/parent::div//div[@role='button']",
  progress: ".//*[@role='progressbar']",
  // "@" is required: without it XPath looks for a child *element* named
  // data-testid and the predicate can never be true.
  promoted: ".//div[@data-testid='placementTracking']",
};

/**
 * Walks a Twitter/X timeline entry by entry: opens image carousels, quoted
 * tweets and the tweet itself, and waits on playing media.
 */
export class TwitterTimelineBehavior {
  /** URLs of tweets that have already been opened. */
  seenTweets = new Set();
  /** URLs of tweets whose media playback was already waited on. */
  seenMediaTweets = new Set();

  static id = "Twitter";

  /** @returns {boolean} true on https x.com / twitter.com URLs. */
  static isMatch() {
    return /https:\/\/(www\.)?(x|twitter)\.com\//.test(window.location.href);
  }

  /** @returns {{state: object, opts: {maxDepth: number}}} initial state and options. */
  static init() {
    return {
      state: {
        tweets: 0,
        images: 0,
        videos: 0,
      },
      opts: {
        maxDepth: 0,
      },
    };
  }

  /**
   * @param {object} ctx - behavior context.
   * @param {Node|null} root - node searched for a progress bar.
   * @returns {boolean} true when a progress bar taller than 10px is found.
   */
  showingProgressBar(ctx, root) {
    const { xpathNode } = ctx.Lib;
    const node = xpathNode(TWITTER_QUERIES.progress, root);
    if (!node) {
      return false;
    }
    return node.clientHeight > 10;
  }

  /**
   * Waits two units, then returns the sibling after `child` once it no longer
   * shows a progress bar.
   *
   * @param {object} ctx - behavior context.
   * @param {Element|null} child - current timeline entry.
   * @returns {Promise<Element|null>} next entry, or null when there is none.
   */
  async waitForNext(ctx, child) {
    const { sleep, waitUnit } = ctx.Lib;
    if (!child) {
      return null;
    }
    await sleep(waitUnit * 2);
    if (!child.nextElementSibling) {
      return null;
    }
    while (this.showingProgressBar(ctx, child.nextElementSibling)) {
      await sleep(waitUnit);
    }
    return child.nextElementSibling;
  }

  /**
   * Clicks the expand button inside `child` (if any) and returns the entry that
   * occupies its position afterwards.
   *
   * @param {object} ctx - behavior context.
   * @param {Element} child - timeline entry without an article.
   * @returns {Promise<Element|null>} `child` itself when there is nothing to
   *   expand, otherwise the entry found at the same position after the click.
   */
  async expandMore(ctx, child) {
    const { sleep, waitUnit, xpathNode } = ctx.Lib;
    const expandElem = xpathNode(TWITTER_QUERIES.expand, child);
    if (!expandElem) {
      return child;
    }

    // Remember how to find this position again after the click. The first
    // entry has no previous sibling, so it is located through its parent.
    const prev = child.previousElementSibling;
    const parent = child.parentElement;
    const entryAtPosition = () => {
      if (prev) {
        return prev.nextElementSibling;
      }
      return (parent && parent.firstElementChild) || null;
    };

    expandElem.click();
    await sleep(waitUnit);
    while (this.showingProgressBar(ctx, entryAtPosition())) {
      await sleep(waitUnit);
    }
    return entryAtPosition();
  }

  /**
   * Yields the `article` node of each timeline entry under the root container,
   * advancing with `waitForNext` after each one.
   *
   * @param {object} ctx - behavior context.
   */
  async *infScroll(ctx) {
    const { scrollIntoView, RestoreState, sleep, waitUnit, xpathNode } = ctx.Lib;
    const root = xpathNode(TWITTER_QUERIES.rootPath);
    if (!root) {
      return;
    }
    let child = root.firstElementChild;
    if (!child) {
      return;
    }
    while (child) {
      let anchorElem = xpathNode(TWITTER_QUERIES.anchor, child);
      if (!anchorElem && TWITTER_QUERIES.expand) {
        child = await this.expandMore(ctx, child);
        anchorElem = xpathNode(TWITTER_QUERIES.anchor, child);
      }
      if (child?.innerText) {
        scrollIntoView(child);
      }
      if (child && anchorElem) {
        await sleep(waitUnit);
        // Created before yielding; after the consumer resumes, `restore` is
        // used to look the entry up again when a match value was recorded.
        const restorer = new RestoreState(TWITTER_QUERIES.childMatchSelect, child);
        yield anchorElem;
        if (restorer.matchValue) {
          child = await restorer.restore(TWITTER_QUERIES.rootPath, TWITTER_QUERIES.childMatch);
        }
      }
      child = await this.waitForNext(ctx, child);
    }
  }

  /**
   * If the tweet contains playing media, reports it and waits (at most the
   * value 60000 passed to `sleep`) for playback to end, abort, error or pause.
   *
   * @param {object} ctx - behavior context.
   * @param {Element} tweet - article node of the tweet.
   */
  async *mediaPlaying(ctx, tweet) {
    const { getState, sleep, xpathNode, xpathString } = ctx.Lib;
    const media = xpathNode("(.//video | .//audio)", tweet);
    if (!media || media.paused) {
      return;
    }
    let mediaTweetUrl = null;
    try {
      mediaTweetUrl = new URL(
        xpathString(TWITTER_QUERIES.childMatchSelect, tweet.parentElement),
        window.location.origin,
      ).href;
    } catch (e) {
      console.warn(e);
    }
    // Media with a direct https .mp4 source is only reported, not waited on.
    if (media.src.startsWith("https://") && media.src.indexOf(".mp4") > 0) {
      yield getState(ctx, `Loading video for ${mediaTweetUrl || "unknown"}`, "videos");
      return;
    }
    let msg;
    if (mediaTweetUrl) {
      if (this.seenMediaTweets.has(mediaTweetUrl)) {
        return;
      }
      msg = `Waiting for media playback for ${mediaTweetUrl} to finish`;
      this.seenMediaTweets.add(mediaTweetUrl);
    } else {
      msg = "Loading video";
    }
    yield getState(ctx, msg, "videos");
    const p = new Promise((resolve) => {
      media.addEventListener("ended", () => resolve(null));
      media.addEventListener("abort", () => resolve(null));
      media.addEventListener("error", () => resolve(null));
      media.addEventListener("pause", () => resolve(null));
    });
    await Promise.race([p, sleep(60000)]);
  }

  /**
   * Opens the tweet's image popup and clicks through the carousel until the
   * location stops changing, then goes back.
   *
   * @param {object} ctx - behavior context.
   * @param {Element} tweet - article node of the tweet.
   */
  async *clickImages(ctx, tweet) {
    const { getState, HistoryState, sleep, waitUnit, xpathNode } = ctx.Lib;
    const imagePopup = xpathNode(TWITTER_QUERIES.image, tweet);
    if (!imagePopup) {
      return;
    }
    const imageState = new HistoryState(() => imagePopup.click());
    yield getState(ctx, "Loading Image: " + window.location.href, "images");
    await sleep(waitUnit * 5);
    let nextImage = xpathNode(TWITTER_QUERIES.imageFirstNext);
    let prevLocation = window.location.href;
    while (nextImage) {
      nextImage.click();
      await sleep(waitUnit * 2);
      // An unchanged location means the click did not advance the carousel.
      if (window.location.href === prevLocation) {
        await sleep(waitUnit * 5);
        break;
      }
      prevLocation = window.location.href;
      yield getState(ctx, "Loading Image: " + window.location.href, "images");
      await sleep(waitUnit * 5);
      nextImage = xpathNode(TWITTER_QUERIES.imageNext);
    }
    await imageState.goBack(TWITTER_QUERIES.imageClose);
  }

  /**
   * Clicks a tweet; if the history state changed, optionally recurses into the
   * opened thread (while `depth < ctx.opts.maxDepth`) and then goes back.
   *
   * @param {object} ctx - behavior context.
   * @param {Element} tweet - node to click.
   * @param {number} depth - current recursion depth.
   */
  async *clickTweet(ctx, tweet, depth) {
    const { getState, HistoryState, sleep, waitUnit } = ctx.Lib;
    const tweetState = new HistoryState(() => tweet.click());
    await sleep(waitUnit);
    if (!tweetState.changed) {
      return;
    }
    yield getState(ctx, "Capturing Tweet: " + window.location.href, "tweets");
    const maxDepth = ctx.opts.maxDepth;
    if (depth < maxDepth && !this.seenTweets.has(window.location.href)) {
      yield* this.iterTimeline(ctx, depth + 1);
    }
    this.seenTweets.add(window.location.href);
    await sleep(waitUnit * 2);
    await tweetState.goBack(TWITTER_QUERIES.backButton);
    await sleep(waitUnit);
  }

  /**
   * Processes every non-promoted tweet of the timeline currently shown.
   *
   * @param {object} ctx - behavior context.
   * @param {number} [depth=0] - recursion depth forwarded to `clickTweet`.
   */
  async *iterTimeline(ctx, depth = 0) {
    const { getState, sleep, waitUnit, xpathNode } = ctx.Lib;
    if (this.seenTweets.has(window.location.href)) {
      return;
    }
    yield getState(ctx, "Capturing thread: " + window.location.href, "threads");
    for await (const tweet of this.infScroll(ctx)) {
      if (xpathNode(TWITTER_QUERIES.promoted, tweet)) {
        continue;
      }
      await sleep(waitUnit * 2.5);
      const viewButton = xpathNode(TWITTER_QUERIES.viewSensitive, tweet);
      if (viewButton) {
        viewButton.click();
        await sleep(waitUnit * 2.5);
      }
      yield* this.clickImages(ctx, tweet);
      const quoteTweet = xpathNode(TWITTER_QUERIES.quote, tweet);
      if (quoteTweet) {
        // The depth of 1000 is passed so the `depth < maxDepth` check in
        // clickTweet fails for any smaller maxDepth (no recursion into quotes).
        yield* this.clickTweet(ctx, quoteTweet, 1000);
      }
      yield* this.mediaPlaying(ctx, tweet);
      yield* this.clickTweet(ctx, tweet, depth);
      await sleep(waitUnit * 5);
    }
  }

  /**
   * Entry point: iterates the timeline starting at depth 0.
   *
   * @param {object} ctx - behavior context providing the `Lib` helpers.
   */
  async *run(ctx) {
    yield* this.iterTimeline(ctx, 0);
  }

  /**
   * Reports "not_logged_in" when the page HTML contains "Log In".
   *
   * @param {object} ctx - behavior context.
   */
  async awaitPageLoad(ctx) {
    const { sleep, assertContentValid } = ctx.Lib;
    // NOTE(review): other sleeps in this class pass waitUnit multiples or
    // 60000; a bare 5 looks very short — confirm the intended unit.
    await sleep(5);
    assertContentValid(() => !document.documentElement.outerHTML.match(/Log In/i), "not_logged_in");
  }
}