diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index 92459b5..0000000
--- a/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2022 Julian Müller (ChaoticByte)
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/README.md b/README.md
index d59d861..6136dbe 100644
--- a/README.md
+++ b/README.md
@@ -8,10 +8,13 @@ Create a graph from the links on a website and the following sites.
 
 ### Pip Dependencies
 
+- pyvis
+- requests
+
 You can install those dependencies with
 
 ```
-pip install -r requirements.txt
+pip install -r pip-dependencies.txt
 ```
 
 ## Usage
diff --git a/lib/args.py b/lib/args.py
index c697394..24a99a7 100644
--- a/lib/args.py
+++ b/lib/args.py
@@ -2,7 +2,9 @@
 from argparse import ArgumentParser
 
 argparser = ArgumentParser(description="Map all links on a site (and links on resulting sites)")
+
 argparser.add_argument("url", help="The URL of the site you want to start from")
+
 argparser.add_argument("--dump", action="store_true", help="Only output the found connections to the console and exit")
 argparser.add_argument("--max-depth", metavar="N", type=int, help="The maximum depth at which links will be followed (default: 3)", default=3)
 argparser.add_argument("--max-links-per-site", metavar="N", type=int, help="The maximum amount of links on a page that will be included (default: 3)", default=3)
diff --git a/lib/graph.py b/lib/graph.py
index 2fc6970..34c76aa 100644
--- a/lib/graph.py
+++ b/lib/graph.py
@@ -1,11 +1,14 @@
+
 from pyvis.network import Network
 
 from .linkmap import LinkMap
 
 
 def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
+
     pyvis_net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%", directed=True)
+
     pyvis_net.add_nodes(linkmap.links, size=[8]*len(linkmap.links))
     pyvis_net.add_edges(linkmap.link_connections)
 
 
@@ -63,5 +66,6 @@ def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
 
 
     # pyvis_net.show_buttons()
     pyvis_net.set_options(pyvis_options)
+
     return pyvis_net
diff --git a/lib/linkmap_from_sitelinks.py b/lib/linkmap_from_sitelinks.py
index b5c2b93..43d1855 100644
--- a/lib/linkmap_from_sitelinks.py
+++ b/lib/linkmap_from_sitelinks.py
@@ -1,30 +1,27 @@
-import asyncio
+from concurrent.futures import ThreadPoolExecutor
 from html.parser import HTMLParser
 from random import sample
+from requests import get as request_get
 from sys import stderr
 from urllib.parse import urlparse
 
-from httpx import AsyncClient
-
 from .linkmap import LinkMap
 
 
-class HTMLLinkFinder(HTMLParser):
+class _HTMLExternalLinkFinder(HTMLParser):
 
     links = []
 
     def handle_starttag(self, tag, attrs):
-        if tag != "a":
-            return
-        for a in attrs:
-            attr, val = a
-            if attr != "href":
-                continue
-            if val.startswith("https://") or val.startswith("http://"):
-                if not val in self.links:
-                    self.links.append(val)
+        if tag == "a":
+            for a in attrs:
+                attr, val = a
+                if attr == "href":
+                    if val.startswith("https://") or val.startswith("http://"):
+                        if not val in self.links:
+                            self.links.append(val)
 
     def get_links(self, input_html:str):
         self.feed(input_html)
 
 
@@ -38,67 +35,80 @@ class LinkMapFromSitelinksGenerator:
     generated_linkmap = LinkMap()
     max_links_per_site = 3
     max_depth = 3
+    max_threads = 4
     enable_log = False
 
     def log(self, something):
         if self.enable_log:
             print(something, file=stderr)
 
-    async def _get_html(self, url:str, client:AsyncClient) -> str:
-        content = bytearray()
+    def _get_html(self, url:str) -> str:
+        html_content = ""
+        # receive up to self.site_request_max_len bytes after a maximum of self.site_request_timeout seconds
+        self.log("----" + url)
+        response = request_get(url, stream=True, timeout=self.site_request_timeout)
+        response.raise_for_status()
         content_size = 0
-        # receive up to self.site_request_max_len bytes after
-        # a maximum of self.site_request_timeout seconds
-        self.log(f"Request: {url}")
-        async with client.stream(
-            "GET",
-            url,
-            timeout=self.site_request_timeout,
-            follow_redirects=True
-        ) as stream:
-            async for chunk in stream.aiter_bytes(1024):
-                content_size += len(chunk)
-                if content_size > self.site_request_max_len:
-                    self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
-                    break
-                else:
-                    content.extend(chunk)
-        # decode
-        try:
-            html_content = content.decode()
-        except UnicodeDecodeError:
-            self.log(f"Couldn't decode {url}")
-            html_content = ""
+        content_chunks = []
+        for chunk in response.iter_content(1024, decode_unicode=True):
+            content_size += len(chunk)
+            if content_size > self.site_request_max_len:
+                self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
+                break
+            else:
+                content_chunks.append(chunk)
+        html_content = "".join(content_chunks)
         return html_content
 
-    async def _get_linked_sites_coro(self, url, client:AsyncClient):
-        linked_sites = []
-        try:
-            html = await self._get_html(url, client)
-            found_links = HTMLLinkFinder().get_links(html)
-            found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
-            for l in found_links:
-                self.log(f"Found {l}")
-                if l != None:
-                    linked_sites.append(l)
-        except KeyboardInterrupt:
-            exit("KeyboardInterrupt")
-        except Exception as e:
-            self.log("An exception occcured while trying to get links from '" + url + "': ")
-            self.log(e)
-        return url, linked_sites
+    def _get_linked_sites_thread(self, urls:list):
+        def _get_links(url:str):
+            sites = []
+            try:
+                html = self._get_html(url)
+                found_links = _HTMLExternalLinkFinder().get_links(html)
+                found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
+                self.log("\n".join(found_links))
+                for l in found_links:
+                    if l != None:
+                        sites.append(l)
+            except KeyboardInterrupt:
+                exit("KeyboardInterrupt")
+            except Exception as e:
+                self.log("An exception occurred while trying to get links from '" + url + "': ")
+                self.log(e)
+            return sites
+        links = {}
+        for url in urls:
+            links[url] = _get_links(url)
+        return links
 
-    async def _get_linked_sites(self, urls:list, client:AsyncClient):
-        # get results
-        results = await asyncio.gather(*[self._get_linked_sites_coro(url, client) for url in urls])
-        results_as_dict = {}
-        for url, links in results:
-            results_as_dict[url] = links
-        return results_as_dict
+    def _get_linked_sites(self, urls:list):
+        # split urls into self.max_threads chunks
+        urlchunks = []
+        chunk_size = max((len(urls) + self.max_threads - 1) // self.max_threads, 1)  # ceil division, so no url is dropped
+        for i in range(self.max_threads):
+            start = i*chunk_size
+            end = (i*chunk_size)+chunk_size
+            new_chunk = urls[start:end]
+            if len(new_chunk) > 0:
+                urlchunks.append(new_chunk)
+        results = []
+        # threads
+        with ThreadPoolExecutor() as tpe:
+            self.log(f"--Using {len(urlchunks)} concurrent connections...")
+            futures = [tpe.submit(self._get_linked_sites_thread, chunk) for chunk in urlchunks]
+            for f in futures:
+                # wait for results
+                results.append(f.result())
+        results_combined = {}
+        for result_chunk in results:
+            for url in result_chunk:
+                results_combined[url] = result_chunk[url]
+        return results_combined
 
-    async def _generate_linkmap(self, start_urls:list, _current_depth:int, client:AsyncClient):
+    def _generate_linkmap(self, start_urls:list, _current_depth:int):
         linkdict = {}
-        linked_sites = await self._get_linked_sites(start_urls, client)
+        linked_sites = self._get_linked_sites(start_urls)
         for url in linked_sites:
             linkdict[url] = {}
             self.generated_linkmap.add_link(url)
@@ -108,14 +118,13 @@ class LinkMapFromSitelinksGenerator:
                     self.generated_linkmap.add_link_connection(url, l)#
         if _current_depth < self.max_depth:
             for url in linkdict:
-                linkdict[url] = await self._generate_linkmap(list(linkdict[url]), _current_depth + 1, client)
+                linkdict[url] = self._generate_linkmap(list(linkdict[url]), _current_depth + 1)
 
-    async def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
+    def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
         self.generated_linkmap = LinkMap()
         self.max_links_per_site = max_links_per_site
         self.max_depth = max_depth
-        async with AsyncClient() as client:
-            await self._generate_linkmap([start_url], 1, client)
+        self._generate_linkmap([start_url], 1)
 
     def get_linkmap(self) -> LinkMap:
         return self.generated_linkmap
diff --git a/linkmapy.py b/linkmapy.py
index c481b43..fb7617d 100755
--- a/linkmapy.py
+++ b/linkmapy.py
@@ -1,16 +1,13 @@
 #!/usr/bin/env python3
 
-# Copyright (c) 2021 Julian Müller (ChaoticByte)
-
-
-import asyncio
-
 from lib.args import argparser
 from lib.graph import pyvis_graph_from_linkmap
 from lib.linkmap_from_sitelinks import LinkMapFromSitelinksGenerator
 
 
+if __name__ == "__main__":
+
+    args = argparser.parse_args()
-async def main(args):
     nm = LinkMapFromSitelinksGenerator()
     nm.site_request_max_len = args.http_download_limit
     nm.enable_log = args.log
@@ -18,22 +15,16 @@ async def main(args):
     starturl = args.url
     if not (starturl.startswith("https://") or starturl.startswith("http://")):
         starturl = "https://" + starturl
-
-    await nm.generate(
-        starturl,
-        max_depth=args.max_depth,
-        max_links_per_site=args.max_links_per_site
-    )
+
+    nm.generate(starturl, max_depth=args.max_depth, max_links_per_site=args.max_links_per_site)
 
     if args.dump:
+
         print(
             "\n".join(str(c) for c in nm.get_linkmap().link_connections)
         )
+
     else:
+
         pyvis_network_graph = pyvis_graph_from_linkmap(nm.get_linkmap(), heading=starturl)
         pyvis_network_graph.show("output.html")
-
-
-if __name__ == "__main__":
-    args = argparser.parse_args()
-    asyncio.run(main(args))
diff --git a/pip-dependencies.txt b/pip-dependencies.txt
new file mode 100644
index 0000000..71d9dfe
--- /dev/null
+++ b/pip-dependencies.txt
@@ -0,0 +1,2 @@
+pyvis
+requests
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 35bfa31..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-pyvis
-httpx
\ No newline at end of file