
How can I save the results of my web scraping to an Excel file?

I've been trying to find a way for a while. Also, does anyone have ideas for how I can select other pages that I want to fetch?

This is my code:

from urllib.request import urlopen as req
from openpyxl import Workbook
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from urllib.request import urlopen
import time

import requests 


def checkproduct(url):
   opt = webdriver.ChromeOptions()
   opt.add_argument('headless') 

   driver = webdriver.Chrome()
   #driver = webdriver.Chrome(options=opt)
   driver.get(url)

   time.sleep(1)
   driver.execute_script("window.scrollTo(0, 400);")
   time.sleep(1)
   driver.execute_script("window.scrollTo(0, 1200);")
   time.sleep(1)
   driver.execute_script("window.scrollTo(0, 3000);")
   time.sleep(1)

   page_html = driver.page_source
   data = soup(page_html,'html.parser')

   allproduct = data.findAll('div',{'class':'c16H9d'})
   product_title = allproduct[0].text
   product_url = 'https:'+ allproduct[0].a['href']

   list_title = []
   list_url = []
   list_price = []
   list_image = []

   for pd in allproduct:
    pd_title = pd.text
    pd_url = 'https:' + pd.a['href']
    list_title.append(pd_title)
    list_url.append('https:' + pd.a['href'])


   allprice = data.findAll('span',{'class':'c13VH6'})
   for pc in allprice:
    pc_price = pc.text
    pc_price = pc_price.replace('฿','')
    pc_price = pc_price.replace(',','') 
    list_price.append(float(pc_price))


   allimages = data.findAll('img',{'class':'c1ZEkM'})
   for productimages in allimages:
    productimages_url = productimages['src']
    list_image.append(productimages_url)


   print(list_title)
   print(list_url)
   print(pc_price)
   print(list_image)


   driver.close()


   return(list_title,list_price,list_url,list_image)


base_url = "https://www.lazada.co.th/shop-smart-tv?pages="

n = 2
for i in range(1, n+1):
  response = base_url + "%d" %i
  url = response
  print (url)
  checkproduct(url)
  print ('_________________________')



laptop = checkproduct(url)
excelfile = Workbook()
row = excelfile.active
header = ['Product','Price','URL','Images']
row.append(header)


for i,j,k,l in zip(laptop[0],laptop[1],laptop[2],laptop[3]):
  row.append([i,j,k,l])


  #row['A45'] = 'cheapest'
  #row['A46'] = 'most expensive'

  #min_price = min(laptop[1])
  #find = laptop[1].index(min_price)

  #row['B45'] = laptop[0][find]
  #row['C45'] = laptop[1][find]
  #row['D45'] = laptop[2][find]

  #max_price = max(laptop[1])
  #find = laptop[1].index(max_price)

  #row['B46'] = laptop[0][find]
  #row['C46'] = laptop[1][find]
  #row['D46'] = laptop[2][find]
excelfile.save('Lazada_Product2.xlsx')
print('Done')

In this loop it only extracts one set of data into the Excel file. What do I need to do to extract more, or to extract more than one page?

2 Comments
  • Can you be more specific about the issue? You've just dumped your entire program. Commented Feb 8, 2020 at 17:33
  • If your problem is solved, please mark the answer as accepted so that others can see that your question has been answered. Commented Feb 9, 2020 at 7:02

2 Answers


The problem is that you never save the results returned by your function. Your indentation also looks broken.
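In other words, a bare call like checkproduct(url) computes the four lists and then throws them away; nothing outside the function ever sees them. A minimal sketch of the pattern, with checkproduct, base_url and n as in your code (results_per_page is just an illustrative name):

results_per_page = []
for i in range(1, n + 1):
    url = base_url + str(i)
    # capture the return value instead of discarding it
    results = checkproduct(url)
    results_per_page.append(results)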

Try this simplified and cleaned-up code; it works fine and saves several hundred entries to an Excel sheet:

from bs4 import BeautifulSoup as soup
from selenium import webdriver
import time
import pandas as pd

def checkproduct(url):

    driver = webdriver.Chrome()
    driver.get(url)

    # scroll down so the lazily loaded products render before parsing
    driver.execute_script("window.scrollTo(0, 3000);")
    time.sleep(10)

    page_html = driver.page_source
    data = soup(page_html, 'html.parser')

    list_title = []
    list_url = []
    list_price = []
    list_image = []

    # product titles and links (the loop variable must not shadow pandas' pd!)
    allproduct = data.findAll('div', {'class': 'c16H9d'})
    for product in allproduct:
        list_title.append(product.text)
        list_url.append('https:' + product.a['href'])

    # prices, stripped of the currency symbol and thousands separators
    allprice = data.findAll('span', {'class': 'c13VH6'})
    for pc in allprice:
        pc_price = pc.text.replace('฿', '').replace(',', '')
        list_price.append(float(pc_price))

    # product image URLs
    allimages = data.findAll('img', {'class': 'c1ZEkM'})
    for productimages in allimages:
        list_image.append(productimages['src'])

    driver.close()

    return [list_title, list_price, list_url, list_image]

base_url = "https://www.lazada.co.th/shop-smart-tv?pages="

n = 3
rows = []

for i in range(1, n + 1):
    url = base_url + f"{i}"
    print(url)
    # you need to save the returned values from your function!
    results = checkproduct(url)
    rows.append(pd.DataFrame(results).T)

df = pd.concat(rows).reset_index(drop=True)
df.columns = ['Product', 'Price', 'URL', 'Images']
df.to_excel("Lazada_Product.xlsx")

Be aware that I use a pandas DataFrame for easy data manipulation and saving.
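If you would rather stick with openpyxl, as in your original code, the same data can be written without pandas. A rough sketch, reusing checkproduct, base_url and n from the code above:

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.append(['Product', 'Price', 'URL', 'Images'])

for i in range(1, n + 1):
    titles, prices, urls, images = checkproduct(base_url + str(i))
    # one worksheet row per product
    for record in zip(titles, prices, urls, images):
        ws.append(list(record))

wb.save('Lazada_Product.xlsx')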


4 Comments

Thanks, Patezuich, you have helped me a lot. But after trying to run it, I found that the information it received looked like the same page each time. Do you have any solutions?
You're welcome. And are you sure? The script saves all scraped data from all pages to one single Excel sheet.
I'm sure. This script can't scrape the other pages; the Excel file only saves one page of the loop. Do you have any solutions?
I just checked again and can't confirm that. The script scrapes exactly 40 samples per page: if I set n=1 I get 40 samples, if I set n=2 I get 80 samples, and so on. Again, please be aware that all the data is saved to one single Excel sheet; it's not one sheet per webpage. But you do get all the data from all the pages.
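As a side note to this thread: if what you actually want is one worksheet per scraped page instead of one combined sheet, pd.ExcelWriter supports that. A sketch, again reusing checkproduct, base_url and n from the answer above:

import pandas as pd

with pd.ExcelWriter('Lazada_Product_per_page.xlsx') as writer:
    for i in range(1, n + 1):
        page_df = pd.DataFrame(checkproduct(base_url + str(i))).T
        page_df.columns = ['Product', 'Price', 'URL', 'Images']
        # each page gets its own sheet instead of being concatenated
        page_df.to_excel(writer, sheet_name=f'page_{i}', index=False)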

This is simple Python code to fetch data from a website and export it to an Excel file.

You may need to install the required dependencies with pip:

pip install requests
pip install bs4
pip install selenium
pip install pandas
pip install openpyxl
pip install xlsxwriter

If you need any help with this code, you can contact me by mail.

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'
page = requests.get(url)
soup = BeautifulSoup(page.text, "html.parser")

# the page contains three tables with the same class name;
# change the index to 0, 1 or 2 to pick a different one
table = soup.find_all('table')[0]

# the header cells (<th>) become the DataFrame columns
world_titles = table.find_all('th')
word_table_titles = [title.text.strip() for title in world_titles]

df = pd.DataFrame(columns=word_table_titles)

# every remaining row (<tr>) becomes one DataFrame row
column_data = table.find_all('tr')
for row in column_data[1:]:
    row_data = row.find_all('td')
    individual_row_data = [data.text.strip() for data in row_data]
    df.loc[len(df)] = individual_row_data

# save to an Excel file named after the current timestamp
filename = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
with pd.ExcelWriter(filename + ' Output.xlsx') as writer:
    df.to_excel(writer, index=False)
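For a plain HTML table like this one, pandas can also do the parsing itself. A shorter sketch of the same extraction with pd.read_html (it needs lxml or html5lib installed; the output filename is just an example):

import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'
# read_html returns one DataFrame per <table> element on the page
tables = pd.read_html(url)
tables[0].to_excel('largest_companies.xlsx', index=False)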

