The code below reads a downloaded HTML file and uses BeautifulSoup with the lxml parser to extract all of the URLs present in the document.
The function should return full URLs (prefixed with the domain) with no duplicates; links containing a fragment are split on '#' and only the first part is kept. I may want to run this script 1000 times a day, so efficiency is also important.
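For example, under those rules (using a hypothetical base URL of https://example.com, which is not from my actual data), I expect hrefs to normalize like this:

    # Illustrative only: hypothetical hrefs mapped to the URL I expect back,
    # assuming the page was fetched from https://example.com
    expected = {
        '/about#team':               'https://example.com/about',
        '/about/':                   'https://example.com/about',
        'https://example.com/about': 'https://example.com/about',
        'mailto:info@example.com':   None,  # non-http links are dropped
    }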
import os
import subprocess
import time

from bs4 import BeautifulSoup as BS


class Generate(object):
    def __init__(self, dirname):
        # Store downloads in a sibling directory of the current working directory.
        self.BASE_DIR = '{}/{}'.format(
            os.path.abspath(os.path.join(os.getcwd(), os.pardir)), dirname)

    def download(self, urlx, filename):
        # Fetch the page with wget unless it has already been saved.
        filepath = '{}/{}.html'.format(self.BASE_DIR, filename)
        if not os.path.isfile(filepath):
            subprocess.call(
                ['wget', urlx, '-O', '{}'.format(filepath)],
                stdout=subprocess.DEVNULL)
        return filepath

    def url_formatter(self, url, starturl):
        # Drop fragments, prepend the domain to root-relative links,
        # strip trailing slashes, and discard anything that is not http(s).
        if starturl.endswith('/'):
            starturl = starturl[:-1]
        if '#' in url:
            url = url.split('#')[0]
        if url.startswith('/'):
            url = '{}{}'.format(starturl, url)
        if url.endswith('/'):
            url = url[:-1]
        if url.startswith('http'):
            return url
        return None

    def url_lister(self, main_url, filename, starturl, domain=False):
        # Parse the saved page and collect normalized, de-duplicated links,
        # optionally restricted to a given domain.
        startx = time.time()
        filepath = self.download(main_url, filename)
        with open(filepath, 'rt') as fh:
            data = fh.read()
        soup = BS(data, 'lxml')
        href_links = []
        for link in soup.find_all('a', href=True):
            url = self.url_formatter(link['href'], starturl)
            if url is not None:
                if domain is not False and domain in url:
                    href_links.append(url)
                elif domain is False:
                    href_links.append(url)
        print(time.time() - startx)
        return sorted(set(href_links))
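For context, this is roughly how I call it; the directory name, file name, and URLs below are placeholders rather than my real values:

    # Usage sketch only: 'pages', 'example', and the URLs are placeholders.
    # Assumes the '../pages' directory already exists and wget is installed.
    gen = Generate('pages')
    links = gen.url_lister(
        'https://example.com/', 'example',
        starturl='https://example.com', domain='example.com')
    for link in links:
        print(link)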