Add existing project files
parent 432b0ec10f
commit 1f02b8d265
7 changed files with 263 additions and 0 deletions

.gitignore (vendored, Normal file, 2 lines)
@@ -0,0 +1,2 @@
output.html
__pycache__

lib/args.py (Normal file, 12 lines)
@@ -0,0 +1,12 @@

from argparse import ArgumentParser

argparser = ArgumentParser(description="Map all links on a site (and links on resulting sites)")

argparser.add_argument("url", help="The URL of the site you want to start from")

argparser.add_argument("--dump", action="store_true", help="Only output the found connections to the console and exit")
argparser.add_argument("--max-depth", metavar="N", type=int, help="The maximum depth at which links will be followed (default: 3)", default=3)
argparser.add_argument("--max-links-per-site", metavar="N", type=int, help="The maximum amount of links on a page that will be included (default: 3)", default=3)
argparser.add_argument("--http-download-limit", metavar="NBYTES", type=int, help="The maximum length of a requested html file download (in bytes) (default: 10000000)", default=10000000)
argparser.add_argument("--log", action="store_true", default=False, help="Log all visited sites and links to stderr")

lib/graph.py (Normal file, 68 lines)
@@ -0,0 +1,68 @@

from networkx import from_pandas_edgelist
from pandas import DataFrame
from pyvis.network import Network


def pyvis_graph_from_pandas_DF(pandas_df:DataFrame, source_column:str="link1", target_column:str="link2", heading:str=None) -> Network:

    nx = from_pandas_edgelist(pandas_df, source=source_column, target=target_column)
    pyvis_net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%")
    pyvis_net.from_nx(nx, default_node_size=8)

    if heading != None:
        pyvis_net.heading = heading + """
        <style>
            body {
                background-color: #222222;
            }
            h1 {
                font-size: 1.15rem;
                margin: .5rem;
                color: #fafafa;
                font-family: sans-serif;
            }
            #mynetwork {
                border: none !important;
            }
        </style>
        """

    pyvis_options = """
    var options = {
        "nodes": {
            "font": {
                "size": 12
            }
        },
        "edges": {
            "arrows": {
                "to": {
                    "enabled": true,
                    "scaleFactor": 0.3
                }
            },
            "color": {
                "inherit": true
            },
            "smooth": false
        },
        "interaction": {
            "hover": false
        },
        "physics": {
            "barnesHut": {
                "centralGravity": 0,
                "springLength": 200,
                "springConstant": 0.01,
                "avoidOverlap": 0
            },
            "minVelocity": 0.75
        }
    }
    """

    # pyvis_net.show_buttons()
    pyvis_net.set_options(pyvis_options)

    return pyvis_net
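
As a rough usage sketch (not part of the commit itself), the helper above could be driven with a hypothetical two-edge DataFrame; the a/b/c.example URLs below are made up, only the link1/link2 column layout and the calls come from the committed code:

from pandas import DataFrame
from lib.graph import pyvis_graph_from_pandas_DF

# hypothetical toy edge list in the link1/link2 layout the helper expects
edges = DataFrame({
    "link1": ["https://a.example", "https://a.example"],
    "link2": ["https://b.example", "https://c.example"],
})

net = pyvis_graph_from_pandas_DF(edges, heading="https://a.example")
net.show("output.html")  # same call linkmapy.py uses to write the interactive graph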

lib/linkmap.py (Normal file, 120 lines)
@@ -0,0 +1,120 @@


from html.parser import HTMLParser
from random import sample
from requests import get as request_get
from sys import stderr
from urllib.parse import urlparse


class _HTMLExternalLinkFinder(HTMLParser):

    links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for a in attrs:
                attr, val = a
                if attr == "href":
                    if val.startswith("https://") or val.startswith("http://"):
                        if not val in self.links:
                            self.links.append(val)

    def get_links(self, input_html:str):
        self.feed(input_html)
        return self.links


class LinkMap:

    links = []
    link_connections = [
        # (link1, link2)
    ]

    def add_link_connection(self, link1, link2):
        if not link1 in self.links:
            self.links.append(link1)
        if not link2 in self.links:
            self.links.append(link2)
        self.link_connections.append((link1, link2))

    def add_link(self, link):
        if not link in self.links:
            self.links.append(link)


class LinkMapFromSitelinksGenerator:

    site_request_max_len = 10000000 # bytes
    site_request_timeout = 10 # seconds
    already_visited = []
    generated_linkmap = LinkMap()
    max_links_per_site = 3
    max_depth = 3
    enable_log = False

    def log(self, something):
        if self.enable_log:
            print(something, file=stderr)

    def _get_html(self, url:str) -> str:
        html_content = ""
        # receive up to self.site_request_max_len bytes after a maximum of self.site_request_timeout seconds
        self.log("-----" + url)
        response = request_get(url, stream=True, timeout=self.site_request_timeout)
        response.raise_for_status()
        content_size = 0
        content_chunks = []
        for chunk in response.iter_content(1024, decode_unicode=True):
            content_size += len(chunk)
            if content_size > self.site_request_max_len:
                self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
                break
            else:
                content_chunks.append(chunk)
        html_content = "".join(content_chunks)
        return html_content

    def _get_linked_sites(self, url:str):
        sites = []
        if not url in self.already_visited:
            try:
                html = self._get_html(url)
                found_links = _HTMLExternalLinkFinder().get_links(html)
                found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
                self.log("\n".join(found_links))
                for l in found_links:
                    if l != None:
                        sites.append(l)
                self.already_visited.append(url)
            except KeyboardInterrupt:
                exit("KeyboardInterrupt")
            except Exception as e:
                self.log("An exception occurred while trying to get links from '" + url + "': ")
                self.log(e)
        return sites

    def _generate_linkmap(self, start_urls:list, _current_depth:int):
        linkdict = {}
        for url in start_urls:
            linkdict[url] = {}
            self.generated_linkmap.add_link(url)
            linked_sites = self._get_linked_sites(url)
            for l in linked_sites:
                if l != url:
                    linkdict[url][l] = {}
                    self.generated_linkmap.add_link_connection(url, l)
        if _current_depth < self.max_depth:
            for url in linkdict:
                linkdict[url] = self._generate_linkmap(list(linkdict[url]), _current_depth + 1)

    def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
        self.already_visited = []
        self.generated_linkmap = LinkMap()
        self.max_links_per_site = max_links_per_site
        self.max_depth = max_depth
        self._generate_linkmap([start_url], 1)

    def get_linkmap(self) -> LinkMap:
        return self.generated_linkmap

lib/linkmap2pandasdf.py (Normal file, 21 lines)
@@ -0,0 +1,21 @@


from pandas import DataFrame
from .linkmap import LinkMap


def linkmap2pandasDF(linkmap:LinkMap) -> DataFrame:

    data_connections = {
        "link1": [],
        "link2": []
    }

    for c in linkmap.link_connections:
        link1, link2 = c
        data_connections["link1"].append(link1)
        data_connections["link2"].append(link2)

    df = DataFrame(data=data_connections)

    return df

linkmapy.py (Normal file, 37 lines)
@@ -0,0 +1,37 @@
#!/usr/bin/env python3

import json

from sys import stderr

from lib.args import argparser
from lib.graph import pyvis_graph_from_pandas_DF
from lib.linkmap import LinkMapFromSitelinksGenerator
from lib.linkmap2pandasdf import linkmap2pandasDF


if __name__ == "__main__":

    args = argparser.parse_args()

    nm = LinkMapFromSitelinksGenerator()
    nm.site_request_max_len = args.http_download_limit
    nm.enable_log = args.log

    starturl = args.url
    if not (starturl.startswith("https://") or starturl.startswith("http://")):
        starturl = "https://" + starturl

    nm.generate(starturl, max_depth=args.max_depth, max_links_per_site=args.max_links_per_site)

    if args.dump:

        print(
            "\n".join(str(c) for c in nm.get_linkmap().link_connections)
        )

    else:

        pandasDF = linkmap2pandasDF(nm.get_linkmap())
        pyvis_network_graph = pyvis_graph_from_pandas_DF(pandasDF, heading=starturl)
        pyvis_network_graph.show("output.html")
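
For reference, the entry point above takes only the options defined in lib/args.py; a typical run would presumably look like "python3 linkmapy.py example.com --max-depth 2 --log", and passing --dump prints the collected link pairs to the console instead of writing output.html.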

pip-dependencies.txt (Normal file, 3 lines)
@@ -0,0 +1,3 @@
networkx
pandas
pyvis
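
Assuming a standard pip setup, these could be installed with "pip install -r pip-dependencies.txt"; note that lib/linkmap.py also imports requests, which is not listed here and may need to be installed separately.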