I've written a Python script that collects the links to posts from a target page and then fetches the title of each post by going one level deeper, into the post's own page.
I've applied the `@get_links` decorator, which scrapes the title from each post's inner page.
I would welcome any suggestions for improving my existing approach while keeping the decorator in place, as I'm very new to working with decorators.
import functools
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Stack Overflow listing page for questions tagged "web-scraping"; the script
# entry point passes it to get_info as the page to start from.
url = "https://stackoverflow.com/questions/tagged/web-scraping"
def get_links(func):
    """Decorate a link-producing function so that it returns post titles.

    ``func`` must return an iterable of URLs. The decorated function
    forwards its arguments to ``func``, downloads every returned URL and
    collects the text of the first ``h1[itemprop='name'] a`` element found
    on each page.

    Args:
        func: Callable returning an iterable of page URLs.

    Returns:
        A wrapper accepting the same arguments as ``func`` that returns a
        list of title strings, in the order the links were produced. Pages
        without a matching title element are skipped.

    Raises:
        requests.RequestException: Raised by the wrapper when a page cannot
            be fetched within the timeout or answers with an HTTP error
            status.
    """
    request_timeout = 10  # seconds; requests never times out by default

    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def get_target_link(*args, **kwargs):
        titles = []
        for link in func(*args, **kwargs):
            res = requests.get(link, timeout=request_timeout)
            # Fail loudly on 4xx/5xx instead of parsing an error page.
            res.raise_for_status()
            soup = BeautifulSoup(res.text, "lxml")
            heading = soup.select_one("h1[itemprop='name'] a")
            if heading is None:
                # select_one returns None when nothing matches; skip the
                # page rather than crash on ``None.text``.
                continue
            titles.append(heading.text)
        return titles

    return get_target_link
@get_links
def get_info(link):
    """Collect absolute question URLs from the listing page at ``link``.

    The page is downloaded and every element matching
    ``.summary .question-hyperlink`` contributes its ``href``, resolved
    against ``link`` so the result is correct for any listing URL.

    Note that the ``@get_links`` decorator rebinds the name ``get_info`` to
    the wrapper it builds, so callers receive whatever that wrapper returns
    rather than the list of URLs produced here.

    Args:
        link: URL of the listing page to scan.

    Returns:
        A list of absolute URLs, in document order (before decoration).

    Raises:
        requests.RequestException: If the listing page cannot be fetched
            within the timeout or answers with an HTTP error status.
    """
    res = requests.get(link, timeout=10)  # seconds; avoid hanging forever
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "lxml")
    # Resolve relative hrefs against the page that was actually requested,
    # not a module-level constant that may point somewhere else.
    return [
        urljoin(link, item.get("href"))
        for item in soup.select(".summary .question-hyperlink")
    ]
if __name__ == '__main__':
    # Script entry point: scrape starting from the configured listing page
    # and print whatever the decorated get_info returns.
    titles = get_info(url)
    print(titles)
It is not clear from the code itself what `check_pagination` is supposed to help with; can you explain its purpose in more detail, please?