I have a data frame with several columns, one of which contains HTML objects (each containing tables). I want to turn that into a column of table arrays.
My problem is that this piece of code takes a long time to run. Is there any way I can optimize it? I tried a list comprehension, which doesn't significantly improve the run time.
Some have suggested that I restructure the logic. Any suggestions on how?
# imports implied by the snippet (adjust to match your script)
from bs4 import BeautifulSoup as bs
from country_list import countries_for_language
from lxml.etree import tostring

df = htmldf
countries = dict(countries_for_language('en'))
countrylist = list(countries.values())
arrayoftableswithcountry = []
arrayofhtmltables = []

for idx, row in df.iterrows():
    # print("We are now at row", idx + 1, "of", len(df), ".")
    inner_html = tostring(row['html'])
    soup = bs(inner_html, 'lxml')
    tableswithcountry = []
    outputr = []
    for idex, item in enumerate(soup.select('table')):
        # print("Extracting", idex + 1, "of", len(soup.select('table')), ".")
        table = soup.select('table')[idex]
        rows = table.find_all('tr')
        output = []
        for tr in rows:
            cols = tr.find_all('td')
            cols = [cell.text.strip() for cell in cols]
            output.append([cell for cell in cols if cell])
        if methodsname == 'revseg_geo':  # methodsname is defined elsewhere in the script
            if '$' in str(output):
                for country in countrylist:
                    if country in str(output):
                        tableswithcountry.append(output)
                        outputr.append(table)
    arrayoftableswithcountry.append(tableswithcountry)
    arrayofhtmltables.append(outputr)

df['arrayoftables'] = arrayoftableswithcountry
df['arrayofhtmltables'] = arrayofhtmltables
print('Made array of tables.')
df = df.drop(columns=['html'])  # drop() returns a new frame, so keep the result
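For reference, this is roughly the kind of restructuring I have been considering, as an untested sketch: parse each document once, reuse the table element that enumerate() already yields instead of calling soup.select('table') again, and build str(output) only once per table. The helper name extract_tables is just a placeholder, and any() records a matching table once rather than once per matching country, which differs slightly from the loop above.

from bs4 import BeautifulSoup as bs
from country_list import countries_for_language
from lxml.etree import tostring

countrylist = list(dict(countries_for_language('en')).values())

def extract_tables(html_element, check_country):
    # Parse one HTML object and return (table arrays with a country, matching table tags).
    soup = bs(tostring(html_element), 'lxml')
    tableswithcountry, htmltables = [], []
    for table in soup.select('table'):
        output = [
            [cell for cell in (td.text.strip() for td in tr.find_all('td')) if cell]
            for tr in table.find_all('tr')
        ]
        text = str(output)  # build the search string once per table
        if check_country and '$' in text and any(c in text for c in countrylist):
            tableswithcountry.append(output)
            htmltables.append(table)
    return tableswithcountry, htmltables

results = df['html'].apply(lambda el: extract_tables(el, methodsname == 'revseg_geo'))
df['arrayoftables'] = results.str[0]
df['arrayofhtmltables'] = results.str[1]
df = df.drop(columns=['html'])

Would this direction actually help, or is the repeated parsing/iterrows() not the main cost here?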