
How can I save the results of my web scraping to an Excel file?

I've been trying to find a way for a while. Also, does anyone have ideas for how I can select other pages that I want to fetch?

This is my code:

from urllib.request import urlopen as req
from openpyxl import Workbook
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from urllib.request import urlopen
import time

import requests 


def checkproduct(url):
   opt = webdriver.ChromeOptions()
   opt.add_argument('headless') 

   driver = webdriver.Chrome()
   #driver = webdriver.Chrome(options=opt)
   driver.get(url)

   time.sleep(1)
   driver.execute_script("window.scrollTo(0, 400);")
   time.sleep(1)
   driver.execute_script("window.scrollTo(0, 1200);")
   time.sleep(1)
   driver.execute_script("window.scrollTo(0, 3000);")
   time.sleep(1)

   page_html = driver.page_source
   data = soup(page_html,'html.parser')

   allproduct = data.findAll('div',{'class':'c16H9d'})
   product_title = allproduct[0].text
   product_url = 'https:'+ allproduct[0].a['href']

   list_title = []
   list_url = []
   list_price = []
   list_image = []

   for pd in allproduct:
    pd_title = pd.text
    pd_url = 'https:' + pd.a['href']
    list_title.append(pd_title)
    list_url.append('https:' + pd.a['href'])


   allprice = data.findAll('span',{'class':'c13VH6'})
   for pc in allprice:
    pc_price = pc.text
    pc_price = pc_price.replace('฿','')
    pc_price = pc_price.replace(',','') 
    list_price.append(float(pc_price))


   allimages = data.findAll('img',{'class':'c1ZEkM'})
   for productimages in allimages:
    productimages_url = productimages['src']
    list_image.append(productimages_url)


   print(list_title)
   print(list_url)
   print(pc_price)
   print(list_image)


   driver.close()


   return(list_title,list_price,list_url,list_image)


base_url = "https://www.lazada.co.th/shop-smart-tv?pages="

n = 2
for i in range(1, n+1):
  response = base_url + "%d" %i
  url = response
  print (url)
  checkproduct(url)
  print ('_________________________')



laptop = checkproduct(url)
excelfile = Workbook()
row = excelfile.active
header = ['Product','Price','URL','Images']
row.append(header)


for i,j,k,l in zip(laptop[0],laptop[1],laptop[2],laptop[3]):
  row.append([i,j,k,l])


  #row['A45'] = 'cheapest'
  #row['A46'] = 'most expensive'

  #min_price = min(laptop[1])
  #find = laptop[1].index(min_price)

  #row['B45'] = laptop[0][find]
  #row['C45'] = laptop[1][find]
  #row['D45'] = laptop[2][find]

  #max_price = max(laptop[1])
  #find = laptop[1].index(max_price)

  #row['B46'] = laptop[0][find]
  #row['C46'] = laptop[1][find]
  #row['D46'] = laptop[2][find]
excelfile.save('Lazada_Product2.xlsx')
print('Done')

In this loop it only extracts one set of data into the Excel file. What do I need to do to extract more, or to extract more than one page?

2 Comments
  • Can you be more specific about the issue? You've just dumped your entire program. Commented Feb 8, 2020 at 17:33
  • If your problem is solved, please mark the answer as accepted so that others can see that your question has been answered. Commented Feb 9, 2020 at 7:02

2 Answers


The problem is that you never save the results returned by your function. Your indentation also looks broken.
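In other words, a bare call like checkproduct(url) computes the four lists and then throws them away; nothing outside the function ever sees them. A minimal sketch of the pattern, with checkproduct, base_url and n as in your code (results_per_page is just an illustrative name):

results_per_page = []
for i in range(1, n + 1):
    url = base_url + str(i)
    # capture the return value instead of discarding it
    results = checkproduct(url)
    results_per_page.append(results)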

Try this simplified and cleaned-up code; it works fine and saves several hundred entries to an Excel sheet:

from bs4 import BeautifulSoup as soup
from selenium import webdriver
import time
import pandas as pd

def checkproduct(url):

    driver = webdriver.Chrome()
    driver.get(url)

    # scroll down so the lazily loaded products render before parsing
    driver.execute_script("window.scrollTo(0, 3000);")
    time.sleep(10)

    page_html = driver.page_source
    data = soup(page_html, 'html.parser')

    list_title = []
    list_url = []
    list_price = []
    list_image = []

    # product titles and links (the loop variable must not shadow pandas' pd!)
    allproduct = data.findAll('div', {'class': 'c16H9d'})
    for product in allproduct:
        list_title.append(product.text)
        list_url.append('https:' + product.a['href'])

    # prices, stripped of the currency symbol and thousands separators
    allprice = data.findAll('span', {'class': 'c13VH6'})
    for pc in allprice:
        pc_price = pc.text.replace('฿', '').replace(',', '')
        list_price.append(float(pc_price))

    # product image URLs
    allimages = data.findAll('img', {'class': 'c1ZEkM'})
    for productimages in allimages:
        list_image.append(productimages['src'])

    driver.close()

    return [list_title, list_price, list_url, list_image]

base_url = "https://www.lazada.co.th/shop-smart-tv?pages="

n = 3
rows = []

for i in range(1, n + 1):
    url = base_url + f"{i}"
    print(url)
    # you need to save the returned values from your function!
    results = checkproduct(url)
    rows.append(pd.DataFrame(results).T)

df = pd.concat(rows).reset_index(drop=True)
df.columns = ['Product', 'Price', 'URL', 'Images']
df.to_excel("Lazada_Product.xlsx")

Be aware that I use a pandas DataFrame for easy data manipulation and saving.
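If you would rather stick with openpyxl, as in your original code, the same data can be written without pandas. A rough sketch, reusing checkproduct, base_url and n from the code above:

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.append(['Product', 'Price', 'URL', 'Images'])

for i in range(1, n + 1):
    titles, prices, urls, images = checkproduct(base_url + str(i))
    # one worksheet row per product
    for record in zip(titles, prices, urls, images):
        ws.append(list(record))

wb.save('Lazada_Product.xlsx')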


4 Comments

Thanks, Patezuich, you have helped me a lot. But after trying to run it, I found that the information it received looked like the same page each time. Do you have any solutions?
You're welcome. And are you sure? The script saves all scraped data from all pages to one single Excel sheet.
I'm sure. This script can't scrape the other pages; the Excel file only saves one page of the loop. Do you have any solutions?
I just checked again and can't confirm that. The script scrapes exactly 40 samples per page: if I set n=1 I get 40 samples, if I set n=2 I get 80 samples, and so on. Again, please be aware that all the data is saved to one single Excel sheet; it's not one sheet per webpage. But you do get all the data from all the pages.
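As a side note to this thread: if what you actually want is one worksheet per scraped page instead of one combined sheet, pd.ExcelWriter supports that. A sketch, again reusing checkproduct, base_url and n from the answer above:

import pandas as pd

with pd.ExcelWriter('Lazada_Product_per_page.xlsx') as writer:
    for i in range(1, n + 1):
        page_df = pd.DataFrame(checkproduct(base_url + str(i))).T
        page_df.columns = ['Product', 'Price', 'URL', 'Images']
        # each page gets its own sheet instead of being concatenated
        page_df.to_excel(writer, sheet_name=f'page_{i}', index=False)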

This is simple Python code to fetch data from a website and export it to an Excel file.

You may need to install the required dependencies with pip:

pip install requests
pip install bs4
pip install selenium
pip install pandas
pip install openpyxl
pip install xlsxwriter

If you need any help with this code, you can contact me by mail.

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'
page = requests.get(url)
soup = BeautifulSoup(page.text, "html.parser")

# the page contains three tables with the same class name;
# change the index to 0, 1 or 2 to pick a different one
table = soup.find_all('table')[0]

# the header cells (<th>) become the DataFrame columns
world_titles = table.find_all('th')
word_table_titles = [title.text.strip() for title in world_titles]

df = pd.DataFrame(columns=word_table_titles)

# every remaining row (<tr>) becomes one DataFrame row
column_data = table.find_all('tr')
for row in column_data[1:]:
    row_data = row.find_all('td')
    individual_row_data = [data.text.strip() for data in row_data]
    df.loc[len(df)] = individual_row_data

# save to an Excel file named after the current timestamp
filename = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
with pd.ExcelWriter(filename + ' Output.xlsx') as writer:
    df.to_excel(writer, index=False)
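For a plain HTML table like this one, pandas can also do the parsing itself. A shorter sketch of the same extraction with pd.read_html (it needs lxml or html5lib installed; the output filename is just an example):

import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'
# read_html returns one DataFrame per <table> element on the page
tables = pd.read_html(url)
tables[0].to_excel('largest_companies.xlsx', index=False)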

