mlwerke2pdf/mlwerke2pdf.py

52 lines
1.5 KiB
Python

import requests
from bs4 import BeautifulSoup
import urllib.parse
import pdfkit
class MLWerk:
    """Collect an index page and the pages it links to, and render them as one PDF.

    Constructing an instance downloads and parses ``url`` immediately.
    ``text2pdf`` then runs the whole pipeline: gather the linked pages,
    merge their bodies into the index document and write the PDF.

    Attributes:
        url: Address of the index page.
        baseurl: ``url`` cut after its last ``/``; relative links are
            resolved against it.
        subsites: ``SubSite`` objects collected by ``getsubsites``.
        soup: Parsed index page (a ``BeautifulSoup`` document).
    """

    # Seconds to wait on the server; without a timeout ``requests.get``
    # may block forever on an unresponsive host.
    REQUEST_TIMEOUT = 30

    class SubSite:
        """A single linked page, downloaded and parsed on construction."""

        def __init__(self, url):
            """Store ``url`` and fetch the page right away."""
            self.url = url
            self.getsoup()

        def getsoup(self):
            """Download ``self.url`` and store the parsed page in ``self.soup``."""
            r = requests.get(self.url, timeout=MLWerk.REQUEST_TIMEOUT)
            # The response is always decoded as UTF-8, regardless of headers.
            r.encoding = 'UTF-8'
            self.soup = BeautifulSoup(r.text, 'html.parser')

    def __init__(self, url):
        """Store ``url``, derive ``baseurl`` and fetch the index page."""
        self.url = url
        self.baseurl = url[:url.rfind('/')] + '/'
        self.subsites = []
        self.getsoup()

    def getsoup(self):
        """Download ``self.url`` and store the parsed page in ``self.soup``."""
        r = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        # The response is always decoded as UTF-8, regardless of headers.
        r.encoding = 'UTF-8'
        self.soup = BeautifulSoup(r.text, 'html.parser')

    def getsubsites(self, limit=None):
        """Fetch the pages linked from the index and append them to ``subsites``.

        Only plain relative links are followed: anchors without an ``href``
        and hrefs containing ``#`` (fragments) or ``/`` (paths outside the
        index page's directory, absolute URLs) are skipped.

        Args:
            limit: Maximum number of pages to collect in this call, or
                ``None`` for no limit.
        """
        added = 0
        for anchor in self.soup.find_all('a'):
            if limit is not None and added >= limit:
                break
            href = anchor.get('href')
            # Check for None first: ``"#" in None`` would raise TypeError.
            if href is None or "#" in href or "/" in href:
                continue
            target = urllib.parse.urljoin(self.baseurl, href)
            self.subsites.append(self.SubSite(target))
            added += 1

    def concatsubsites(self):
        """Move the body content of every collected subsite into the index body.

        Subsites whose document has no ``<body>`` are skipped.
        """
        for subsite in self.subsites:
            body = subsite.soup.body
            if body is None:
                continue
            # Appending a node to another tree detaches it from its current
            # parent, so iterate over a snapshot; iterating the live child
            # list would skip siblings.
            for element in list(body.contents):
                self.soup.body.append(element)

    def genpdf(self, path):
        """Render the current ``self.soup`` document to a PDF file at ``path``."""
        pdfkit.from_string(str(self.soup), path, options={"enable-local-file-access": ""})

    def text2pdf(self, path):
        """Run the full pipeline: collect subsites, merge them, write the PDF.

        Args:
            path: Destination of the generated PDF.
        """
        print("gettings subsites")
        self.getsubsites()
        print("concatting subsites")
        self.concatsubsites()
        print("generating pdf")
        self.genpdf(path)