I've written a script in Python using the `threading` module to scrape two sites simultaneously. It parses both sites flawlessly. Any insight into how I can improve this script would be appreciated.
Here is what I did:
import requests ; from lxml import html
import threading ; import time
# Yellow Pages search URL: "coffee" in Los Angeles, CA, results page 2.
Yp_link = (
    "https://www.yellowpages.com/search"
    "?search_terms=coffee"
    "&geo_location_terms=Los%20Angeles%2C%20CA"
    "&page=2"
)
# The /videos/ page on wiseowl.co.uk.
Tuts_link = "http://www.wiseowl.co.uk/videos/"
def create_links(url):
    """Download a results page and print name, street and phone per listing.

    Every ``div.info`` element in the page is treated as one listing. A
    field whose element is absent from a listing is printed as an empty
    string instead of aborting the whole run with ``IndexError``.

    Args:
        url: Address of the page to download.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled server from blocking this thread forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx rather than silently parsing an error page.
    response.raise_for_status()
    tree = html.fromstring(response.text)
    for listing in tree.cssselect("div.info"):
        name = _first_text(listing, "a.business-name span[itemprop=name]")
        street = _first_text(listing, "span.street-address")
        phone = _first_text(listing, "div[itemprop=telephone]")
        # One-second pause before each printed row, kept from the original.
        time.sleep(1)
        print(name, street, phone)


def _first_text(node, selector):
    """Return the text of the first CSS match under *node*, or "" if none."""
    matches = node.cssselect(selector)
    return matches[0].text if matches else ""
def process_links(link):
    """Download a page and print the text and href of each series-title link.

    Each ``<p class="woVideoListDefaultSeriesTitle">`` element is inspected
    for its first descendant ``<a>``. Paragraphs without a link are skipped,
    and a link without an ``href`` attribute prints an empty string, instead
    of raising ``IndexError`` / ``KeyError``.

    Args:
        link: Address of the page to download.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled server from blocking this thread forever.
    response = requests.get(link, timeout=30)
    # Fail loudly on 4xx/5xx rather than silently parsing an error page.
    response.raise_for_status()
    tree = html.fromstring(response.text)
    for heading in tree.xpath("//p[@class='woVideoListDefaultSeriesTitle']"):
        anchors = heading.xpath('.//a')
        if not anchors:
            # Nothing to report for a title paragraph that has no link.
            continue
        anchor = anchors[0]
        # One-second pause before each printed row, kept from the original.
        time.sleep(1)
        print(anchor.text, anchor.get('href', ''))
# One thread per site, so the two downloads and print loops overlap.
th1 = threading.Thread(target=create_links, args=(Yp_link,))
th2 = threading.Thread(target=process_links, args=(Tuts_link,))

# Start both threads first, then wait for both to finish.
for _worker in (th1, th2):
    _worker.start()
for _worker in (th1, th2):
    _worker.join()
https://www.yellowpages.com/robots.txt and http://www.wiseowl.co.uk/robots.txt