Compare commits


7 commits

SHA1        Message                                                                  Date
f0b5f56e0d  Changed Username                                                         2023-02-08 19:35:46 +00:00
4c3de294bf  Merge branch 'dev' into 'main'                                           2022-08-13 11:32:50 +00:00
            Remove incorrect list of dependencies from README
            See merge request W13R/linkmapy!2
b1a481881f  Remove incorrect list of dependencies from README                        2022-08-13 13:32:09 +02:00
4986df4106  Merge branch 'dev' into 'main'                                           2022-08-13 11:27:03 +00:00
            Refactoring, Change to httpx and asyncio
            See merge request W13R/linkmapy!1
2d2afe7930  Refactored, changed to httx and async instead of requests and threads   2022-08-13 13:19:06 +02:00
56b850dc4f  Add LICENSE                                                              2022-08-13 09:51:42 +02:00
f3074f4729  Rename pip-dependencies.txt to requirements.txt                          2022-08-13 09:41:12 +02:00
8 changed files with 107 additions and 95 deletions

LICENSE (new file)

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 Julian Müller (ChaoticByte)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

@@ -8,13 +8,10 @@ Create a graph from the links on a website and the following sites.
 ### Pip Dependencies
-- pyvis
-- requests
 You can install those dependencies with
 ```
-pip install -r pip-dependencies.txt
+pip install -r requirements.txt
 ```
 ## Usage

lib/args.py

@@ -2,9 +2,7 @@
 from argparse import ArgumentParser
 argparser = ArgumentParser(description="Map all links on a site (and links on resulting sites)")
 argparser.add_argument("url", help="The URL of the site you want to start from")
 argparser.add_argument("--dump", action="store_true", help="Only output the found connections to the console and exit")
 argparser.add_argument("--max-depth", metavar="N", type=int, help="The maximum depth at which links will be followed (default: 3)", default=3)
 argparser.add_argument("--max-links-per-site", metavar="N", type=int, help="The maximum amount of links on a page that will be included (default: 3)", default=3)

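For orientation, a minimal sketch of how the parser defined in lib/args.py could be exercised on its own; the URL and values below are made up for illustration, and only the options visible in this hunk are assumed:

```python
# Hypothetical smoke test of the CLI surface shown above (not part of the repo).
from lib.args import argparser

args = argparser.parse_args(["https://example.com", "--max-depth", "2", "--dump"])
print(args.url)                 # "https://example.com"
print(args.max_depth)           # 2
print(args.max_links_per_site)  # 3 (falls back to the default)
print(args.dump)                # True
```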
lib/graph.py

@@ -1,14 +1,11 @@
 from pyvis.network import Network
 from .linkmap import LinkMap
 def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
     pyvis_net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%", directed=True)
     pyvis_net.add_nodes(linkmap.links, size=[8]*len(linkmap.links))
     pyvis_net.add_edges(linkmap.link_connections)
@@ -66,6 +63,5 @@ def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
     # pyvis_net.show_buttons()
     pyvis_net.set_options(pyvis_options)
     return pyvis_net

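As a rough illustration of the pyvis calls used in lib/graph.py, here is a self-contained sketch that builds a tiny directed graph the same way pyvis_graph_from_linkmap does; the node and edge data and the output filename are invented for the example:

```python
from pyvis.network import Network

# Stand-in for a LinkMap: a few URLs plus directed connections between them.
links = ["https://a.example", "https://b.example", "https://c.example"]
link_connections = [
    ("https://a.example", "https://b.example"),
    ("https://a.example", "https://c.example"),
]

net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%", directed=True)
net.add_nodes(links, size=[8] * len(links))  # one fixed-size node per URL
net.add_edges(link_connections)              # directed edges between existing nodes
net.show("example.html")                     # writes an interactive HTML graph
```

Depending on the installed pyvis version, net.show() may expect notebook=False, or net.write_html() may be the more convenient call.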
lib/linkmap_from_sitelinks.py

@@ -1,27 +1,30 @@
+import asyncio
-from concurrent.futures import ThreadPoolExecutor
 from html.parser import HTMLParser
 from random import sample
-from requests import get as request_get
 from sys import stderr
 from urllib.parse import urlparse
+from httpx import AsyncClient
 from .linkmap import LinkMap
-class _HTMLExternalLinkFinder(HTMLParser):
+class HTMLLinkFinder(HTMLParser):
     links = []
     def handle_starttag(self, tag, attrs):
-        if tag == "a":
-            for a in attrs:
-                attr, val = a
-                if attr == "href":
-                    if val.startswith("https://") or val.startswith("http://"):
-                        if not val in self.links:
-                            self.links.append(val)
+        if tag != "a":
+            return
+        for a in attrs:
+            attr, val = a
+            if attr != "href":
+                continue
+            if val.startswith("https://") or val.startswith("http://"):
+                if not val in self.links:
+                    self.links.append(val)
     def get_links(self, input_html:str):
         self.feed(input_html)
@@ -35,80 +38,67 @@ class LinkMapFromSitelinksGenerator:
     generated_linkmap = LinkMap()
     max_links_per_site = 3
     max_depth = 3
-    max_threads = 4
     enable_log = False
     def log(self, something):
         if self.enable_log:
             print(something, file=stderr)
-    def _get_html(self, url:str) -> str:
-        html_content = ""
-        # receive up to self.site_request_max_len bytes after a maximum of self.site_request_timeout seconds
-        self.log("----" + url)
-        response = request_get(url, stream=True, timeout=self.site_request_timeout)
-        response.raise_for_status()
+    async def _get_html(self, url:str, client:AsyncClient) -> str:
+        content = bytearray()
         content_size = 0
-        content_chunks = []
-        for chunk in response.iter_content(1024, decode_unicode=True):
-            content_size += len(chunk)
-            if content_size > self.site_request_max_len:
-                self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
-                break
-            else:
-                content_chunks.append(chunk)
-        html_content = "".join(content_chunks)
+        # receive up to self.site_request_max_len bytes after
+        # a maximum of self.site_request_timeout seconds
+        self.log(f"Request: {url}")
+        async with client.stream(
+            "GET",
+            url,
+            timeout=self.site_request_timeout,
+            follow_redirects=True
+        ) as stream:
+            async for chunk in stream.aiter_bytes(1024):
+                content_size += len(chunk)
+                if content_size > self.site_request_max_len:
+                    self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
+                    break
+                else:
+                    content.extend(chunk)
+        # decode
+        try:
+            html_content = content.decode()
+        except UnicodeDecodeError:
+            self.log(f"Couldn't decode {url}")
+            html_content = ""
         return html_content
-    def _get_linked_sites_thread(self, urls:list):
-        def _get_links(url:str):
-            sites = []
-            try:
-                html = self._get_html(url)
-                found_links = _HTMLExternalLinkFinder().get_links(html)
-                found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
-                self.log("\n".join(found_links))
-                for l in found_links:
-                    if l != None:
-                        sites.append(l)
-            except KeyboardInterrupt:
-                exit("KeyboardInterrupt")
-            except Exception as e:
-                self.log("An exception occcured while trying to get links from '" + url + "': ")
-                self.log(e)
-            return sites
-        links = {}
-        for url in urls:
-            links[url] = _get_links(url)
-        return links
+    async def _get_linked_sites_coro(self, url, client:AsyncClient):
+        linked_sites = []
+        try:
+            html = await self._get_html(url, client)
+            found_links = HTMLLinkFinder().get_links(html)
+            found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
+            for l in found_links:
+                self.log(f"Found {l}")
+                if l != None:
+                    linked_sites.append(l)
+        except KeyboardInterrupt:
+            exit("KeyboardInterrupt")
+        except Exception as e:
+            self.log("An exception occcured while trying to get links from '" + url + "': ")
+            self.log(e)
+        return url, linked_sites
-    def _get_linked_sites(self, urls:list):
-        # split urls into self.max_threads chunks
-        urlchunks = []
-        chunk_size = max(int(len(urls) / self.max_threads), 1)
-        for i in range(self.max_threads):
-            start = i*chunk_size
-            end = (i*chunk_size)+chunk_size
-            new_chunk = urls[start:end]
-            if len(new_chunk) > 0:
-                urlchunks.append(new_chunk)
-        results = []
-        # threads
-        with ThreadPoolExecutor() as tpe:
-            self.log(f"--Using {len(urlchunks)} concurrent connections...")
-            futures = [tpe.submit(self._get_linked_sites_thread, chunk) for chunk in urlchunks]
-            for f in futures:
-                # wait for results
-                results.append(f.result())
-        results_combined = {}
-        for result_chunk in results:
-            for url in result_chunk:
-                results_combined[url] = result_chunk[url]
-        return results_combined
+    async def _get_linked_sites(self, urls:list, client:AsyncClient):
+        # get results
+        results = await asyncio.gather(*[self._get_linked_sites_coro(url, client) for url in urls])
+        results_as_dict = {}
+        for url, links in results:
+            results_as_dict[url] = links
+        return results_as_dict
-    def _generate_linkmap(self, start_urls:list, _current_depth:int):
+    async def _generate_linkmap(self, start_urls:list, _current_depth:int, client:AsyncClient):
         linkdict = {}
-        linked_sites = self._get_linked_sites(start_urls)
+        linked_sites = await self._get_linked_sites(start_urls, client)
         for url in linked_sites:
             linkdict[url] = {}
             self.generated_linkmap.add_link(url)
@@ -118,13 +108,14 @@ class LinkMapFromSitelinksGenerator:
                     self.generated_linkmap.add_link_connection(url, l)#
         if _current_depth < self.max_depth:
             for url in linkdict:
-                linkdict[url] = self._generate_linkmap(list(linkdict[url]), _current_depth + 1)
+                linkdict[url] = await self._generate_linkmap(list(linkdict[url]), _current_depth + 1, client)
-    def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
+    async def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
         self.generated_linkmap = LinkMap()
         self.max_links_per_site = max_links_per_site
         self.max_depth = max_depth
-        self._generate_linkmap([start_url], 1)
+        async with AsyncClient() as client:
+            await self._generate_linkmap([start_url], 1, client)
     def get_linkmap(self) -> LinkMap:
         return self.generated_linkmap

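The heart of this refactor is the move from requests plus ThreadPoolExecutor to a shared httpx.AsyncClient driven by asyncio.gather. The following stripped-down sketch shows that pattern outside the class; the URLs, the 64 KiB cap, and the 10-second timeout are placeholder values rather than the repo's settings:

```python
import asyncio
from httpx import AsyncClient

MAX_LEN = 64 * 1024  # placeholder download cap, analogous to site_request_max_len

async def fetch_html(url: str, client: AsyncClient) -> str:
    content = bytearray()
    # Stream the body in chunks so the download can stop once the cap is exceeded.
    async with client.stream("GET", url, timeout=10, follow_redirects=True) as response:
        async for chunk in response.aiter_bytes(1024):
            content.extend(chunk)
            if len(content) > MAX_LEN:
                break
    try:
        return content.decode()
    except UnicodeDecodeError:
        return ""

async def fetch_all(urls: list) -> dict:
    # One shared client, one coroutine per URL, all requests in flight concurrently.
    async with AsyncClient() as client:
        results = await asyncio.gather(*[fetch_html(u, client) for u in urls])
    return dict(zip(urls, results))

if __name__ == "__main__":
    pages = asyncio.run(fetch_all(["https://example.com", "https://example.org"]))
    print({url: len(html) for url, html in pages.items()})
```

Compared with the removed thread-pool code, no manual chunking of the URL list is needed: the event loop multiplexes all the streamed requests over the single client.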
@@ -1,13 +1,16 @@
 #!/usr/bin/env python3
+# Copyright (c) 2021 Julian Müller (ChaoticByte)
+import asyncio
 from lib.args import argparser
 from lib.graph import pyvis_graph_from_linkmap
 from lib.linkmap_from_sitelinks import LinkMapFromSitelinksGenerator
-if __name__ == "__main__":
-    args = argparser.parse_args()
+async def main(args):
     nm = LinkMapFromSitelinksGenerator()
     nm.site_request_max_len = args.http_download_limit
     nm.enable_log = args.log
@@ -16,15 +19,21 @@ if __name__ == "__main__":
     if not (starturl.startswith("https://") or starturl.startswith("http://")):
         starturl = "https://" + starturl
-    nm.generate(starturl, max_depth=args.max_depth, max_links_per_site=args.max_links_per_site)
+    await nm.generate(
+        starturl,
+        max_depth=args.max_depth,
+        max_links_per_site=args.max_links_per_site
+    )
     if args.dump:
         print(
             "\n".join(str(c) for c in nm.get_linkmap().link_connections)
         )
     else:
         pyvis_network_graph = pyvis_graph_from_linkmap(nm.get_linkmap(), heading=starturl)
         pyvis_network_graph.show("output.html")
+if __name__ == "__main__":
+    args = argparser.parse_args()
+    asyncio.run(main(args))

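The entry-point change follows the usual shape for converting a synchronous script to asyncio: the old module-level __main__ body becomes an async def main(args) coroutine, and the remaining top-level code only parses arguments and hands them to asyncio.run(). A generic sketch of that shape, with an illustrative argument rather than the repo's full option set:

```python
import asyncio
from argparse import ArgumentParser

async def main(args) -> None:
    # The async work goes here, e.g. awaiting a generator's generate() coroutine.
    print(f"would crawl {args.url}")

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("url")
    args = parser.parse_args()
    asyncio.run(main(args))  # one event loop for the whole program run
```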
pip-dependencies.txt (deleted)

@@ -1,2 +0,0 @@
-pyvis
-requests

requirements.txt (new file)

@@ -0,0 +1,2 @@
+pyvis
+httpx