From f3074f472966511885c180ab2411d2be18708963 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julian=20M=C3=BCller?= <9070224-W13R@users.noreply.gitlab.com>
Date: Sat, 13 Aug 2022 09:41:12 +0200
Subject: [PATCH 1/3] Rename pip-dependencies.txt to requirements.txt

---
 README.md                                | 2 +-
 pip-dependencies.txt => requirements.txt | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename pip-dependencies.txt => requirements.txt (100%)

diff --git a/README.md b/README.md
index 6136dbe..16f7553 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ Create a graph from the links on a website and the following sites.
 You can install those dependencies with
 
 ```
-pip install -r pip-dependencies.txt
+pip install -r requirements.txt
 ```
 
 ## Usage
diff --git a/pip-dependencies.txt b/requirements.txt
similarity index 100%
rename from pip-dependencies.txt
rename to requirements.txt

From 56b850dc4fc793687a10a17e3ecab40a6b2fe50e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julian=20M=C3=BCller?= <9070224-W13R@users.noreply.gitlab.com>
Date: Sat, 13 Aug 2022 09:51:42 +0200
Subject: [PATCH 2/3] Add LICENSE

---
 LICENSE | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 LICENSE

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..2faedc1
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Julian Müller (W13R)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
From 2d2afe7930e61721daabe339ae3f0d134c34eec2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julian=20M=C3=BCller?= <9070224-W13R@users.noreply.gitlab.com>
Date: Sat, 13 Aug 2022 13:15:50 +0200
Subject: [PATCH 3/3] Refactored, changed to httpx and async instead of requests and threads

---
 lib/args.py                   |   2 -
 lib/graph.py                  |   4 -
 lib/linkmap_from_sitelinks.py | 141 ++++++++++++++++------------
 linkmapy.py                   |  25 ++++--
 requirements.txt              |   2 +-
 5 files changed, 84 insertions(+), 90 deletions(-)

diff --git a/lib/args.py b/lib/args.py
index 24a99a7..c697394 100644
--- a/lib/args.py
+++ b/lib/args.py
@@ -2,9 +2,7 @@
 from argparse import ArgumentParser
 
 argparser = ArgumentParser(description="Map all links on a site (and links on resulting sites)")
-
 argparser.add_argument("url", help="The URL of the site you want to start from")
-
 argparser.add_argument("--dump", action="store_true", help="Only output the found connections to the console and exit")
 argparser.add_argument("--max-depth", metavar="N", type=int, help="The maximum depth at which links will be followed (default: 3)", default=3)
 argparser.add_argument("--max-links-per-site", metavar="N", type=int, help="The maximum amount of links on a page that will be included (default: 3)", default=3)
diff --git a/lib/graph.py b/lib/graph.py
index 34c76aa..2fc6970 100644
--- a/lib/graph.py
+++ b/lib/graph.py
@@ -1,14 +1,11 @@
-
 from pyvis.network import Network
 
 from .linkmap import LinkMap
 
 
 def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
-
     pyvis_net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%", directed=True)
-
     pyvis_net.add_nodes(linkmap.links, size=[8]*len(linkmap.links))
     pyvis_net.add_edges(linkmap.link_connections)
 
 
@@ -66,6 +63,5 @@ def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
 
     # pyvis_net.show_buttons()
     pyvis_net.set_options(pyvis_options)
-
     return pyvis_net
 
diff --git a/lib/linkmap_from_sitelinks.py b/lib/linkmap_from_sitelinks.py
index 43d1855..b5c2b93 100644
--- a/lib/linkmap_from_sitelinks.py
+++ b/lib/linkmap_from_sitelinks.py
@@ -1,27 +1,30 @@
+import asyncio
 
-from concurrent.futures import ThreadPoolExecutor
 from html.parser import HTMLParser
 from random import sample
-from requests import get as request_get
 from sys import stderr
 from urllib.parse import urlparse
 
 
+from httpx import AsyncClient
+
 from .linkmap import LinkMap
 
 
-class _HTMLExternalLinkFinder(HTMLParser):
+class HTMLLinkFinder(HTMLParser):
 
     links = []
 
     def handle_starttag(self, tag, attrs):
-        if tag == "a":
-            for a in attrs:
-                attr, val = a
-                if attr == "href":
-                    if val.startswith("https://") or val.startswith("http://"):
-                        if not val in self.links:
-                            self.links.append(val)
+        if tag != "a":
+            return
+        for a in attrs:
+            attr, val = a
+            if attr != "href":
+                continue
+            if val.startswith("https://") or val.startswith("http://"):
+                if not val in self.links:
+                    self.links.append(val)
 
     def get_links(self, input_html:str):
         self.feed(input_html)
@@ -35,80 +38,67 @@ class LinkMapFromSitelinksGenerator:
     generated_linkmap = LinkMap()
     max_links_per_site = 3
     max_depth = 3
-    max_threads = 4
     enable_log = False
 
     def log(self, something):
         if self.enable_log:
            print(something, file=stderr)
 
-    def _get_html(self, url:str) -> str:
-        html_content = ""
-        # receive up to self.site_request_max_len bytes after a maximum of self.site_request_timeout seconds
-        self.log("----" + url)
-        response = request_get(url, stream=True, timeout=self.site_request_timeout)
-        response.raise_for_status()
+    async def _get_html(self, url:str, client:AsyncClient) -> str:
+        content = bytearray()
         content_size = 0
-        content_chunks = []
-        for chunk in response.iter_content(1024, decode_unicode=True):
-            content_size += len(chunk)
-            if content_size > self.site_request_max_len:
-                self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
-                break
-            else:
-                content_chunks.append(chunk)
-        html_content = "".join(content_chunks)
+        # receive up to self.site_request_max_len bytes after
+        # a maximum of self.site_request_timeout seconds
+        self.log(f"Request: {url}")
+        async with client.stream(
+            "GET",
+            url,
+            timeout=self.site_request_timeout,
+            follow_redirects=True
+        ) as stream:
+            async for chunk in stream.aiter_bytes(1024):
+                content_size += len(chunk)
+                if content_size > self.site_request_max_len:
+                    self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
+                    break
+                else:
+                    content.extend(chunk)
+        # decode
+        try:
+            html_content = content.decode()
+        except UnicodeDecodeError:
+            self.log(f"Couldn't decode {url}")
+            html_content = ""
         return html_content
 
-    def _get_linked_sites_thread(self, urls:list):
-        def _get_links(url:str):
-            sites = []
-            try:
-                html = self._get_html(url)
-                found_links = _HTMLExternalLinkFinder().get_links(html)
-                found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
-                self.log("\n".join(found_links))
-                for l in found_links:
-                    if l != None:
-                        sites.append(l)
-            except KeyboardInterrupt:
-                exit("KeyboardInterrupt")
-            except Exception as e:
-                self.log("An exception occcured while trying to get links from '" + url + "': ")
-                self.log(e)
-            return sites
-        links = {}
-        for url in urls:
-            links[url] = _get_links(url)
-        return links
+    async def _get_linked_sites_coro(self, url, client:AsyncClient):
+        linked_sites = []
+        try:
+            html = await self._get_html(url, client)
+            found_links = HTMLLinkFinder().get_links(html)
+            found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
+            for l in found_links:
+                self.log(f"Found {l}")
+                if l != None:
+                    linked_sites.append(l)
+        except KeyboardInterrupt:
+            exit("KeyboardInterrupt")
+        except Exception as e:
+            self.log("An exception occurred while trying to get links from '" + url + "': ")
+            self.log(e)
+        return url, linked_sites
 
-    def _get_linked_sites(self, urls:list):
-        # split urls into self.max_threads chunks
-        urlchunks = []
-        chunk_size = max(int(len(urls) / self.max_threads), 1)
-        for i in range(self.max_threads):
-            start = i*chunk_size
-            end = (i*chunk_size)+chunk_size
-            new_chunk = urls[start:end]
-            if len(new_chunk) > 0:
-                urlchunks.append(new_chunk)
-        results = []
-        # threads
-        with ThreadPoolExecutor() as tpe:
-            self.log(f"--Using {len(urlchunks)} concurrent connections...")
-            futures = [tpe.submit(self._get_linked_sites_thread, chunk) for chunk in urlchunks]
-            for f in futures:
-                # wait for results
-                results.append(f.result())
-        results_combined = {}
-        for result_chunk in results:
-            for url in result_chunk:
-                results_combined[url] = result_chunk[url]
-        return results_combined
+    async def _get_linked_sites(self, urls:list, client:AsyncClient):
+        # get results
+        results = await asyncio.gather(*[self._get_linked_sites_coro(url, client) for url in urls])
+        results_as_dict = {}
+        for url, links in results:
+            results_as_dict[url] = links
+        return results_as_dict
 
-    def _generate_linkmap(self, start_urls:list, _current_depth:int):
+    async def _generate_linkmap(self, start_urls:list, _current_depth:int, client:AsyncClient):
         linkdict = {}
-        linked_sites = self._get_linked_sites(start_urls)
+        linked_sites = await self._get_linked_sites(start_urls, client)
         for url in linked_sites:
             linkdict[url] = {}
             self.generated_linkmap.add_link(url)
@@ -118,13 +108,14 @@ class LinkMapFromSitelinksGenerator:
                 self.generated_linkmap.add_link_connection(url, l)#
         if _current_depth < self.max_depth:
             for url in linkdict:
-                linkdict[url] = self._generate_linkmap(list(linkdict[url]), _current_depth + 1)
+                linkdict[url] = await self._generate_linkmap(list(linkdict[url]), _current_depth + 1, client)
 
-    def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
+    async def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
         self.generated_linkmap = LinkMap()
         self.max_links_per_site = max_links_per_site
         self.max_depth = max_depth
-        self._generate_linkmap([start_url], 1)
+        async with AsyncClient() as client:
+            await self._generate_linkmap([start_url], 1, client)
 
     def get_linkmap(self) -> LinkMap:
         return self.generated_linkmap
diff --git a/linkmapy.py b/linkmapy.py
index fb7617d..2a828d9 100755
--- a/linkmapy.py
+++ b/linkmapy.py
@@ -1,13 +1,16 @@
 #!/usr/bin/env python3
 
+# Copyright (c) 2021 Julian Müller (W13R)
+
+
+import asyncio
+
 from lib.args import argparser
 from lib.graph import pyvis_graph_from_linkmap
 from lib.linkmap_from_sitelinks import LinkMapFromSitelinksGenerator
 
 
-if __name__ == "__main__":
-
-    args = argparser.parse_args()
+async def main(args):
     nm = LinkMapFromSitelinksGenerator()
     nm.site_request_max_len = args.http_download_limit
     nm.enable_log = args.log
@@ -15,16 +18,22 @@ if __name__ == "__main__":
     starturl = args.url
     if not (starturl.startswith("https://") or starturl.startswith("http://")):
         starturl = "https://" + starturl
 
-
-    nm.generate(starturl, max_depth=args.max_depth, max_links_per_site=args.max_links_per_site)
+
+    await nm.generate(
+        starturl,
+        max_depth=args.max_depth,
+        max_links_per_site=args.max_links_per_site
+    )
 
     if args.dump:
-
         print( "\n".join(str(c) for c in nm.get_linkmap().link_connections) )
-
     else:
-
         pyvis_network_graph = pyvis_graph_from_linkmap(nm.get_linkmap(), heading=starturl)
         pyvis_network_graph.show("output.html")
 
+
+
+if __name__ == "__main__":
+    args = argparser.parse_args()
+    asyncio.run(main(args))
diff --git a/requirements.txt b/requirements.txt
index 71d9dfe..35bfa31 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
 pyvis
-requests
\ No newline at end of file
+httpx
\ No newline at end of file
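
For readers who want the gist of PATCH 3/3 without walking the whole diff: the refactor replaces the requests + ThreadPoolExecutor pipeline with a single shared httpx.AsyncClient whose per-URL coroutines run concurrently under asyncio.gather. The following is a minimal standalone sketch of that pattern, not code from the patch; the names fetch_html and MAX_BYTES, the 10-second timeout, and the example URLs are illustrative assumptions.

import asyncio

from httpx import AsyncClient

MAX_BYTES = 64 * 1024  # assumed download cap, similar in spirit to site_request_max_len


async def fetch_html(client: AsyncClient, url: str) -> str:
    # Stream the response and stop as soon as the size cap would be exceeded.
    content = bytearray()
    async with client.stream("GET", url, timeout=10, follow_redirects=True) as response:
        async for chunk in response.aiter_bytes(1024):
            if len(content) + len(chunk) > MAX_BYTES:
                break
            content.extend(chunk)
    try:
        return content.decode()
    except UnicodeDecodeError:
        return ""


async def main(urls):
    # One shared client gives connection pooling; gather() runs one coroutine
    # per URL, replacing the manual URL chunking the thread pool version needed.
    async with AsyncClient() as client:
        pages = await asyncio.gather(*(fetch_html(client, u) for u in urls))
    for url, html in zip(urls, pages):
        print(url, len(html))


if __name__ == "__main__":
    asyncio.run(main(["https://example.org", "https://www.python.org"]))

Sharing one AsyncClient across all coroutines is the main practical win over the threaded version: connections are pooled and reused, the URL list no longer has to be split into per-worker chunks, and a slow download does not stop the other requests from making progress.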