From 1f02b8d26580cb703bcc79d2bebf51640f8bae8a Mon Sep 17 00:00:00 2001
From: W13R <9070224-W13R@users.noreply.gitlab.com>
Date: Thu, 9 Dec 2021 10:55:29 +0100
Subject: [PATCH] Add existing project files

---
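A minimal sketch of how the new modules compose when driven directly from
Python rather than through the linkmapy.py entry point (run from the
repository root; https://example.com is only a placeholder start URL):

    from lib.graph import pyvis_graph_from_pandas_DF
    from lib.linkmap import LinkMapFromSitelinksGenerator
    from lib.linkmap2pandasdf import linkmap2pandasDF

    # Crawl outward from the start URL, collecting (link1, link2) pairs,
    # then render them as an interactive pyvis graph.
    generator = LinkMapFromSitelinksGenerator()
    generator.enable_log = True  # progress is printed to stderr
    generator.generate("https://example.com", max_depth=2, max_links_per_site=3)

    df = linkmap2pandasDF(generator.get_linkmap())
    net = pyvis_graph_from_pandas_DF(df, heading="https://example.com")
    net.show("output.html")  # same output file the CLI entry point writes

The roughly equivalent command-line run would be
"python3 linkmapy.py example.com --max-depth 2 --max-links-per-site 3 --log".
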
 .gitignore              |   2 +
 lib/args.py             |  12 ++++
 lib/graph.py            |  55 ++++++++++++++++++++
 lib/linkmap.py          | 119 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/linkmap2pandasdf.py |  18 ++++++
 linkmapy.py             |  28 ++++++++++
 pip-dependencies.txt    |   4 ++
 7 files changed, 238 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 lib/args.py
 create mode 100644 lib/graph.py
 create mode 100644 lib/linkmap.py
 create mode 100644 lib/linkmap2pandasdf.py
 create mode 100644 linkmapy.py
 create mode 100644 pip-dependencies.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2526a02
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+output.html
+__pycache__
\ No newline at end of file
diff --git a/lib/args.py b/lib/args.py
new file mode 100644
index 0000000..24a99a7
--- /dev/null
+++ b/lib/args.py
@@ -0,0 +1,12 @@
+
+from argparse import ArgumentParser
+
+argparser = ArgumentParser(description="Map all links on a site (and the links on the sites they lead to)")
+
+argparser.add_argument("url", help="The URL of the site you want to start from")
+
+argparser.add_argument("--dump", action="store_true", help="Only output the found connections to the console and exit")
+argparser.add_argument("--max-depth", metavar="N", type=int, help="The maximum depth at which links will be followed (default: 3)", default=3)
+argparser.add_argument("--max-links-per-site", metavar="N", type=int, help="The maximum number of links per page that will be included (default: 3)", default=3)
+argparser.add_argument("--http-download-limit", metavar="NBYTES", type=int, help="The maximum size in bytes of a requested HTML download (default: 10000000)", default=10000000)
+argparser.add_argument("--log", action="store_true", default=False, help="Log all visited sites and links to stderr")
diff --git a/lib/graph.py b/lib/graph.py
new file mode 100644
index 0000000..744a5a2
--- /dev/null
+++ b/lib/graph.py
@@ -0,0 +1,55 @@
+
+from typing import Optional
+
+from networkx import from_pandas_edgelist
+from pandas import DataFrame
+from pyvis.network import Network
+
+def pyvis_graph_from_pandas_DF(pandas_df: DataFrame, source_column: str = "link1", target_column: str = "link2", heading: Optional[str] = None) -> Network:
+
+    # Build a networkx graph from the edge list, then hand it to pyvis for rendering.
+    nx_graph = from_pandas_edgelist(pandas_df, source=source_column, target=target_column)
+    pyvis_net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%")
+    pyvis_net.from_nx(nx_graph, default_node_size=8)
+
+    if heading is not None:
+        pyvis_net.heading = heading
+
+    pyvis_options = """
+    var options = {
+        "nodes": {
+            "font": {
+                "size": 12
+            }
+        },
+        "edges": {
+            "arrows": {
+                "to": {
+                    "enabled": true,
+                    "scaleFactor": 0.3
+                }
+            },
+            "color": {
+                "inherit": true
+            },
+            "smooth": false
+        },
+        "interaction": {
+            "hover": false
+        },
+        "physics": {
+            "barnesHut": {
+                "centralGravity": 0,
+                "springLength": 200,
+                "springConstant": 0.01,
+                "avoidOverlap": 0
+            },
+            "minVelocity": 0.75
+        }
+    }
+    """
+
+    # pyvis_net.show_buttons()
+    pyvis_net.set_options(pyvis_options)
+
+    return pyvis_net
diff --git a/lib/linkmap.py b/lib/linkmap.py
new file mode 100644
index 0000000..daf61e2
--- /dev/null
+++ b/lib/linkmap.py
@@ -0,0 +1,119 @@
+
+from html.parser import HTMLParser
+from random import sample
+from sys import stderr
+
+from requests import get as request_get
+
+
+class _HTMLExternalLinkFinder(HTMLParser):
+
+    def __init__(self):
+        super().__init__()
+        # links must be an instance attribute: a class-level list would be
+        # shared between parser instances and leak links across pages.
+        self.links = []
+
+    def handle_starttag(self, tag, attrs):
+        if tag == "a":
+            for attr, val in attrs:
+                if attr == "href" and val is not None:
+                    if val.startswith("https://") or val.startswith("http://"):
+                        if val not in self.links:
+                            self.links.append(val)
+
+    def get_links(self, input_html: str):
+        self.feed(input_html)
+        return self.links
+
+
+class LinkMap:
+
+    def __init__(self):
+        self.links = []
+        self.link_connections = []  # list of (link1, link2) tuples
+
+    def add_link_connection(self, link1, link2):
+        if link1 not in self.links:
+            self.links.append(link1)
+        if link2 not in self.links:
+            self.links.append(link2)
+        self.link_connections.append((link1, link2))
+
+    def add_link(self, link):
+        if link not in self.links:
+            self.links.append(link)
+
+
+class LinkMapFromSitelinksGenerator:
+
+    def __init__(self):
+        self.site_request_max_len = 10000000  # bytes
+        self.site_request_timeout = 10  # seconds
+        self.already_visited = []
+        self.generated_linkmap = LinkMap()
+        self.max_links_per_site = 3
+        self.max_depth = 3
+        self.enable_log = False
+
+    def log(self, something):
+        if self.enable_log:
+            print(something, file=stderr)
+
+    def _get_html(self, url: str) -> str:
+        # Receive up to self.site_request_max_len bytes, giving up after
+        # self.site_request_timeout seconds.
+        self.log("-----" + url)
+        response = request_get(url, stream=True, timeout=self.site_request_timeout)
+        response.raise_for_status()
+        content_size = 0
+        content_chunks = []
+        for chunk in response.iter_content(1024, decode_unicode=True):
+            content_size += len(chunk)
+            if content_size > self.site_request_max_len:
+                self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
+                break
+            content_chunks.append(chunk)
+        return "".join(content_chunks)
+
+    def _get_linked_sites(self, url: str):
+        sites = []
+        if url not in self.already_visited:
+            try:
+                html = self._get_html(url)
+                found_links = _HTMLExternalLinkFinder().get_links(html)
+                # Follow at most max_links_per_site links, chosen at random.
+                found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
+                self.log("\n".join(found_links))
+                sites.extend(found_links)
+                self.already_visited.append(url)
+            except KeyboardInterrupt:
+                raise SystemExit("KeyboardInterrupt")
+            except Exception as e:
+                self.log("An exception occurred while trying to get links from '" + url + "': ")
+                self.log(e)
+        return sites
+
+    def _generate_linkmap(self, start_urls: list, _current_depth: int):
+        linkdict = {}
+        for url in start_urls:
+            linkdict[url] = {}
+            self.generated_linkmap.add_link(url)
+            linked_sites = self._get_linked_sites(url)
+            for l in linked_sites:
+                if l != url:
+                    linkdict[url][l] = {}
+                    self.generated_linkmap.add_link_connection(url, l)
+        if _current_depth < self.max_depth:
+            for url in linkdict:
+                self._generate_linkmap(list(linkdict[url]), _current_depth + 1)
+
+    def generate(self, start_url: str, max_depth: int = 3, max_links_per_site: int = 3):
+        self.already_visited = []
+        self.generated_linkmap = LinkMap()
+        self.max_links_per_site = max_links_per_site
+        self.max_depth = max_depth
+        self._generate_linkmap([start_url], 1)
+
+    def get_linkmap(self) -> LinkMap:
+        return self.generated_linkmap
diff --git a/lib/linkmap2pandasdf.py b/lib/linkmap2pandasdf.py
new file mode 100644
index 0000000..8be8200
--- /dev/null
+++ b/lib/linkmap2pandasdf.py
@@ -0,0 +1,18 @@
+
+from pandas import DataFrame
+
+from .linkmap import LinkMap
+
+
+def linkmap2pandasDF(linkmap: LinkMap) -> DataFrame:
+
+    data_connections = {
+        "link1": [],
+        "link2": []
+    }
+
+    for link1, link2 in linkmap.link_connections:
+        data_connections["link1"].append(link1)
+        data_connections["link2"].append(link2)
+
+    return DataFrame(data=data_connections)
diff --git a/linkmapy.py b/linkmapy.py
new file mode 100644
index 0000000..768819e
--- /dev/null
+++ b/linkmapy.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+from lib.args import argparser
+from lib.graph import pyvis_graph_from_pandas_DF
+from lib.linkmap import LinkMapFromSitelinksGenerator
+from lib.linkmap2pandasdf import linkmap2pandasDF
+
+
+if __name__ == "__main__":
+
+    args = argparser.parse_args()
+
+    nm = LinkMapFromSitelinksGenerator()
+    nm.site_request_max_len = args.http_download_limit
+    nm.enable_log = args.log
+
+    starturl = args.url
+    if not (starturl.startswith("https://") or starturl.startswith("http://")):
+        starturl = "https://" + starturl
+
+    nm.generate(starturl, max_depth=args.max_depth, max_links_per_site=args.max_links_per_site)
+
+    if args.dump:
+        print("\n".join(str(c) for c in nm.get_linkmap().link_connections))
+    else:
+        pandasDF = linkmap2pandasDF(nm.get_linkmap())
+        pyvis_network_graph = pyvis_graph_from_pandas_DF(pandasDF, heading=starturl)
+        pyvis_network_graph.show("output.html")
diff --git a/pip-dependencies.txt b/pip-dependencies.txt
new file mode 100644
index 0000000..d6414ee
--- /dev/null
+++ b/pip-dependencies.txt
@@ -0,0 +1,4 @@
+networkx
+pandas
+pyvis
+requests
\ No newline at end of file