Compare commits

7 commits:

- f0b5f56e0d
- 4c3de294bf
- b1a481881f
- 4986df4106
- 2d2afe7930
- 56b850dc4f
- f3074f4729

8 changed files with 107 additions and 95 deletions
LICENSE (new file, 21 lines)

@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Julian Müller (ChaoticByte)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

@@ -8,13 +8,10 @@ Create a graph from the links on a website and the following sites.
 
 ### Pip Dependencies
 
-- pyvis
-- requests
-
 You can install those dependencies with
 
 ```
-pip install -r pip-dependencies.txt
+pip install -r requirements.txt
 ```
 
 ## Usage

@@ -2,9 +2,7 @@
 from argparse import ArgumentParser
 
 argparser = ArgumentParser(description="Map all links on a site (and links on resulting sites)")
 
 argparser.add_argument("url", help="The URL of the site you want to start from")
 
 argparser.add_argument("--dump", action="store_true", help="Only output the found connections to the console and exit")
 argparser.add_argument("--max-depth", metavar="N", type=int, help="The maximum depth at which links will be followed (default: 3)", default=3)
 argparser.add_argument("--max-links-per-site", metavar="N", type=int, help="The maximum amount of links on a page that will be included (default: 3)", default=3)

@@ -1,14 +1,11 @@
 from pyvis.network import Network
 
 from .linkmap import LinkMap
 
 
 def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
 
     pyvis_net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%", directed=True)
 
     pyvis_net.add_nodes(linkmap.links, size=[8]*len(linkmap.links))
     pyvis_net.add_edges(linkmap.link_connections)

@@ -66,6 +63,5 @@ def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
 
     # pyvis_net.show_buttons()
     pyvis_net.set_options(pyvis_options)
 
     return pyvis_net

@@ -1,24 +1,27 @@
+import asyncio
+
-from concurrent.futures import ThreadPoolExecutor
 from html.parser import HTMLParser
 from random import sample
-from requests import get as request_get
 from sys import stderr
 from urllib.parse import urlparse
 
+from httpx import AsyncClient
+
 from .linkmap import LinkMap
 
 
-class _HTMLExternalLinkFinder(HTMLParser):
+class HTMLLinkFinder(HTMLParser):
 
     links = []
 
     def handle_starttag(self, tag, attrs):
-        if tag == "a":
-            for a in attrs:
-                attr, val = a
-                if attr == "href":
-                    if val.startswith("https://") or val.startswith("http://"):
-                        if not val in self.links:
-                            self.links.append(val)
+        if tag != "a":
+            return
+        for a in attrs:
+            attr, val = a
+            if attr != "href":
+                continue
+            if val.startswith("https://") or val.startswith("http://"):
+                if not val in self.links:
+                    self.links.append(val)
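
For reference, the renamed HTMLLinkFinder above is plain standard-library HTMLParser code; the refactor only swaps the nested `if tag == "a":` / `if attr == "href":` blocks for early-exit guards. Below is a self-contained sketch of the new class and how it might be exercised; the `get_links()` wrapper and the sample HTML are assumptions for illustration, since `get_links()` itself is not part of this hunk.

```
from html.parser import HTMLParser


class HTMLLinkFinder(HTMLParser):

    # collects absolute http(s) links found in <a href="..."> tags
    links = []

    def handle_starttag(self, tag, attrs):
        if tag != "a":
            return
        for a in attrs:
            attr, val = a
            if attr != "href":
                continue
            if val.startswith("https://") or val.startswith("http://"):
                if not val in self.links:
                    self.links.append(val)

    def get_links(self, html):
        # hypothetical wrapper - the real get_links() is outside this hunk,
        # but it is presumably a thin layer over HTMLParser.feed()
        self.feed(html)
        return self.links


print(HTMLLinkFinder().get_links('<a href="https://example.com">x</a> <a href="/relative">y</a>'))
# -> ['https://example.com']  (relative links are ignored)
```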

@@ -35,80 +38,67 @@ class LinkMapFromSitelinksGenerator:
 
     generated_linkmap = LinkMap()
     max_links_per_site = 3
     max_depth = 3
-    max_threads = 4
     enable_log = False
 
     def log(self, something):
         if self.enable_log:
             print(something, file=stderr)
 
-    def _get_html(self, url:str) -> str:
-        html_content = ""
-        # receive up to self.site_request_max_len bytes after a maximum of self.site_request_timeout seconds
-        self.log("----" + url)
-        response = request_get(url, stream=True, timeout=self.site_request_timeout)
-        response.raise_for_status()
-        content_size = 0
-        content_chunks = []
-        for chunk in response.iter_content(1024, decode_unicode=True):
-            content_size += len(chunk)
-            if content_size > self.site_request_max_len:
-                self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
-                break
-            else:
-                content_chunks.append(chunk)
-        html_content = "".join(content_chunks)
-        return html_content
+    async def _get_html(self, url:str, client:AsyncClient) -> str:
+        content = bytearray()
+        content_size = 0
+        # receive up to self.site_request_max_len bytes after
+        # a maximum of self.site_request_timeout seconds
+        self.log(f"Request: {url}")
+        async with client.stream(
+            "GET",
+            url,
+            timeout=self.site_request_timeout,
+            follow_redirects=True
+        ) as stream:
+            async for chunk in stream.aiter_bytes(1024):
+                content_size += len(chunk)
+                if content_size > self.site_request_max_len:
+                    self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
+                    break
+                else:
+                    content.extend(chunk)
+        # decode
+        try:
+            html_content = content.decode()
+        except UnicodeDecodeError:
+            self.log(f"Couldn't decode {url}")
+            html_content = ""
         return html_content
 
-    def _get_linked_sites_thread(self, urls:list):
-        def _get_links(url:str):
-            sites = []
-            try:
-                html = self._get_html(url)
-                found_links = _HTMLExternalLinkFinder().get_links(html)
-                found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
-                self.log("\n".join(found_links))
-                for l in found_links:
-                    if l != None:
-                        sites.append(l)
-            except KeyboardInterrupt:
-                exit("KeyboardInterrupt")
-            except Exception as e:
-                self.log("An exception occcured while trying to get links from '" + url + "': ")
-                self.log(e)
-            return sites
-        links = {}
-        for url in urls:
-            links[url] = _get_links(url)
-        return links
+    async def _get_linked_sites_coro(self, url, client:AsyncClient):
+        linked_sites = []
+        try:
+            html = await self._get_html(url, client)
+            found_links = HTMLLinkFinder().get_links(html)
+            found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
+            for l in found_links:
+                self.log(f"Found {l}")
+                if l != None:
+                    linked_sites.append(l)
+        except KeyboardInterrupt:
+            exit("KeyboardInterrupt")
+        except Exception as e:
+            self.log("An exception occcured while trying to get links from '" + url + "': ")
+            self.log(e)
+        return url, linked_sites
 
-    def _get_linked_sites(self, urls:list):
-        # split urls into self.max_threads chunks
-        urlchunks = []
-        chunk_size = max(int(len(urls) / self.max_threads), 1)
-        for i in range(self.max_threads):
-            start = i*chunk_size
-            end = (i*chunk_size)+chunk_size
-            new_chunk = urls[start:end]
-            if len(new_chunk) > 0:
-                urlchunks.append(new_chunk)
-        results = []
-        # threads
-        with ThreadPoolExecutor() as tpe:
-            self.log(f"--Using {len(urlchunks)} concurrent connections...")
-            futures = [tpe.submit(self._get_linked_sites_thread, chunk) for chunk in urlchunks]
-            for f in futures:
-                # wait for results
-                results.append(f.result())
-        results_combined = {}
-        for result_chunk in results:
-            for url in result_chunk:
-                results_combined[url] = result_chunk[url]
-        return results_combined
+    async def _get_linked_sites(self, urls:list, client:AsyncClient):
+        # get results
+        results = await asyncio.gather(*[self._get_linked_sites_coro(url, client) for url in urls])
+        results_as_dict = {}
+        for url, links in results:
+            results_as_dict[url] = links
+        return results_as_dict
 
-    def _generate_linkmap(self, start_urls:list, _current_depth:int):
+    async def _generate_linkmap(self, start_urls:list, _current_depth:int, client:AsyncClient):
         linkdict = {}
-        linked_sites = self._get_linked_sites(start_urls)
+        linked_sites = await self._get_linked_sites(start_urls, client)
         for url in linked_sites:
             linkdict[url] = {}
             self.generated_linkmap.add_link(url)

@@ -118,13 +108,14 @@ class LinkMapFromSitelinksGenerator:
                 self.generated_linkmap.add_link_connection(url, l)#
         if _current_depth < self.max_depth:
             for url in linkdict:
-                linkdict[url] = self._generate_linkmap(list(linkdict[url]), _current_depth + 1)
+                linkdict[url] = await self._generate_linkmap(list(linkdict[url]), _current_depth + 1, client)
 
-    def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
+    async def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
         self.generated_linkmap = LinkMap()
         self.max_links_per_site = max_links_per_site
         self.max_depth = max_depth
-        self._generate_linkmap([start_url], 1)
+        async with AsyncClient() as client:
+            await self._generate_linkmap([start_url], 1, client)
 
     def get_linkmap(self) -> LinkMap:
         return self.generated_linkmap
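
Taken together, these hunks replace the requests/ThreadPoolExecutor pipeline with httpx and asyncio: each page is streamed in 1 KiB chunks through a shared AsyncClient (aborting once site_request_max_len is exceeded), and the per-URL coroutines are fanned out with asyncio.gather instead of being batched across threads. A minimal standalone sketch of that pattern, independent of the LinkMapFromSitelinksGenerator class (the URL list, size limit, and timeout values are illustrative):

```
import asyncio

from httpx import AsyncClient

MAX_BYTES = 2**20   # illustrative per-page download limit
TIMEOUT = 10        # illustrative request timeout in seconds


async def fetch_html(url: str, client: AsyncClient) -> str:
    # stream the response in 1 KiB chunks, stop once the size limit is exceeded
    content = bytearray()
    async with client.stream("GET", url, timeout=TIMEOUT, follow_redirects=True) as response:
        async for chunk in response.aiter_bytes(1024):
            content.extend(chunk)
            if len(content) > MAX_BYTES:
                break
    try:
        return content.decode()
    except UnicodeDecodeError:
        return ""


async def fetch_all(urls: list) -> dict:
    # one coroutine per URL, run concurrently on a single shared client
    async with AsyncClient() as client:
        pages = await asyncio.gather(*[fetch_html(u, client) for u in urls])
    return dict(zip(urls, pages))


if __name__ == "__main__":
    result = asyncio.run(fetch_all(["https://example.com", "https://example.org"]))
    print({url: len(html) for url, html in result.items()})
```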

linkmapy.py (23 changed lines)

@@ -1,13 +1,16 @@
 #!/usr/bin/env python3
 
+# Copyright (c) 2021 Julian Müller (ChaoticByte)
+
+import asyncio
+
 from lib.args import argparser
 from lib.graph import pyvis_graph_from_linkmap
 from lib.linkmap_from_sitelinks import LinkMapFromSitelinksGenerator
 
-if __name__ == "__main__":
-
-    args = argparser.parse_args()
-
+
+async def main(args):
     nm = LinkMapFromSitelinksGenerator()
     nm.site_request_max_len = args.http_download_limit
     nm.enable_log = args.log

@@ -16,15 +19,21 @@ if __name__ == "__main__":
     if not (starturl.startswith("https://") or starturl.startswith("http://")):
         starturl = "https://" + starturl
 
-    nm.generate(starturl, max_depth=args.max_depth, max_links_per_site=args.max_links_per_site)
+    await nm.generate(
+        starturl,
+        max_depth=args.max_depth,
+        max_links_per_site=args.max_links_per_site
+    )
 
     if args.dump:
         print(
             "\n".join(str(c) for c in nm.get_linkmap().link_connections)
         )
     else:
         pyvis_network_graph = pyvis_graph_from_linkmap(nm.get_linkmap(), heading=starturl)
         pyvis_network_graph.show("output.html")
 
 
+if __name__ == "__main__":
+    args = argparser.parse_args()
+    asyncio.run(main(args))
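
The command-line interface is unchanged by the async rewrite; argparse still runs first and the parsed arguments are handed to asyncio.run(main(args)). A hypothetical invocation using the flags defined in lib/args.py (the URL and values are examples only):

```
python3 linkmapy.py example.com --max-depth 2 --max-links-per-site 5
```

A bare host is prefixed with https:// automatically; without --dump the graph is written to output.html, and with --dump the found connections are printed to the console instead.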

@@ -1,2 +0,0 @@
-pyvis
-requests

requirements.txt (new file, 2 lines)

@@ -0,0 +1,2 @@
+pyvis
+httpx