This is a follow-up to my question over here.
I adapted @Reinderien's suggested script to a second website, shown below:
fudan.py
from contextlib import contextmanager
from dataclasses import dataclass
from datetime import datetime, date
from pathlib import Path
from typing import Iterable, Optional, ContextManager
# pip install proxy.py
import proxy
from proxy.http.exception import HttpRequestRejected
from proxy.http.parser import HttpParser
from proxy.http.proxy import HttpProxyBasePlugin
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver import Firefox, FirefoxProfile
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
@dataclass
class PrimaryResult:
    caption: str
    date: date
    link: str

    @classmethod
    def from_row(cls, row: WebElement) -> 'PrimaryResult':
        sno, caption, viewed_number, published_date = row.find_elements_by_xpath('td')
        caption_links = caption.find_elements_by_tag_name('a')[0]
        published_date = date.isoformat(datetime.strptime(published_date.text, '%Y/%m/%d'))
        return cls(
            caption=caption_links.text,
            date=published_date,
            link=caption_links.get_attribute('href'),
        )

    def __str__(self):
        return (
            f'題名 {self.caption}'
            f'\n發表時間 {self.date}'
            f'\n文章連結 {self.link}'
        )

class MainPage:
    def __init__(self, driver: WebDriver):
        self.driver = driver

    def submit_search(self, keyword: str) -> None:
        wait = WebDriverWait(self.driver, 100)
        search = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'text2'))
        )
        search.send_keys(keyword)
        search.submit()

    def get_element_and_stop_page(self, *locator) -> WebElement:
        ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)
        wait = WebDriverWait(self.driver, 30, ignored_exceptions=ignored_exceptions)
        elm = wait.until(EC.presence_of_element_located(locator))
        self.driver.execute_script("window.stop();")
        return elm

    def next_page(self) -> None:
        try:
            link = self.get_element_and_stop_page(By.LINK_TEXT, "下页")
        except:
            print("No button with 「下页」 found.")
            return
        try:
            link.click()
            print("Navigating to Next Page")
        except (TimeoutException, WebDriverException):
            print("Last page reached")

class SearchResults:
    def __init__(self, driver: WebDriver):
        self.driver = driver

    def get_structured_elements(self) -> Iterable[PrimaryResult]:
        rows = self.driver.find_elements_by_xpath(
            '//table[1]/tbody/tr[position() > 1]'
        )
        for row in rows:
            yield PrimaryResult.from_row(row)

# class ContentFilterPlugin(HttpProxyBasePlugin):
#     HOST_WHITELIST = {
#         b'ocsp.digicert.com',
#         b'ocsp.sca1b.amazontrust.com',
#         b'big5.oversea.cnki.net',
#         b'gwz.fudan.edu.cn',
#     }
#
#     def handle_client_request(self, request: HttpParser) -> Optional[HttpParser]:
#         host = request.host or request.header(b'Host')
#         if host not in self.HOST_WHITELIST:
#             raise HttpRequestRejected(403)
#         if any(
#             suffix in request.path
#             for suffix in (
#                 b'png', b'ico', b'jpg', b'gif', b'css',
#             )
#         ):
#             raise HttpRequestRejected(403)
#         return request
#
#     def before_upstream_connection(self, request):
#         return super().before_upstream_connection(request)
#
#     def handle_upstream_chunk(self, chunk):
#         return super().handle_upstream_chunk(chunk)
#
#     def on_upstream_connection_close(self):
#         pass

# @contextmanager
# def run_driver() -> ContextManager[WebDriver]:
#     prox_type = ProxyType.MANUAL['ff_value']
#     prox_host = '127.0.0.1'
#     prox_port = 8889
#
#     profile = FirefoxProfile()
#     profile.set_preference('network.proxy.type', prox_type)
#     profile.set_preference('network.proxy.http', prox_host)
#     profile.set_preference('network.proxy.ssl', prox_host)
#     profile.set_preference('network.proxy.http_port', prox_port)
#     profile.set_preference('network.proxy.ssl_port', prox_port)
#     profile.update_preferences()
#
#     plugin = f'{Path(__file__).stem}.{ContentFilterPlugin.__name__}'
#     with proxy.start((
#         '--hostname', prox_host,
#         '--port', str(prox_port),
#         '--plugins', plugin,
#     )), Firefox(profile) as driver:
#         yield driver

def fudan_search(keyword) -> None:
    with Firefox() as driver:
        driver.get('http://www.gwz.fudan.edu.cn/Web/Search?s=' + keyword + '&btnSearch=')
        # driver.get('http://www.gwz.fudan.edu.cn')
        page = MainPage(driver)
        # page.submit_search(keyword)
        primary_result_page = SearchResults(driver)
        primary_results = primary_result_page.get_structured_elements()

        for result in primary_results:
            print(result)
            print()

        page.next_page()


if __name__ == '__main__':
    fudan_search('人性論')
Output:
題名 梅廣:《大學》古本新訂
發表時間 2017-06-12
文章連結 http://www.gwz.fudan.edu.cn/Web/Show/3063

題名 《楚地簡帛思想研究》(第四輯)出版
發表時間 2011-04-28
文章連結 http://www.gwz.fudan.edu.cn/Web/Show/1481

題名 譚樸森先生捐贈圖書總目
發表時間 2008-06-02
文章連結 http://www.gwz.fudan.edu.cn/Web/Show/448

題名 裘錫圭:由郭店簡〈性自命出〉的「室性者故也」說到《孟子》的「天下之言性也」章
發表時間 2008-01-27
文章連結 http://www.gwz.fudan.edu.cn/Web/Show/326

No button with 「下页」 found.
I couldn't get the proxy to work here because the page (ironically) took ages to load whenever the proxy was enabled.
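One alternative I'm considering is to drop the proxy and reuse the old script's `'eager'` page-load strategy in the new one. A minimal, untested sketch is below; the `make_driver` helper is a made-up name and not part of either script:

```python
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options


def make_driver() -> Firefox:
    # 'eager' hands back control once the DOM is ready, without waiting for
    # images and stylesheets -- the same trick driver_init() uses further down.
    options = Options()
    options.page_load_strategy = 'eager'
    return Firefox(options=options)
```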
In my old script below, I have a `search` function that loops through a list of search terms and compiles the results, page by page, into a `caption_link` dictionary. The URLs in that dictionary are then fed into `driver.get` requests to scrape the author, title, and download link of each article, if available. Because not all captions lead to academic articles (and there is no way to tell programmatically beforehand), I can only visit every link and check whether there is anything to download with the `loop_through_url` function.
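To make that data flow concrete, this is roughly the shape of the dictionary being passed around (the caption and URL are taken from the output above; the rest is purely illustrative):

```python
caption_link = {
    # caption (deduplicated)  ->  article page URL
    '梅廣:《大學》古本新訂': 'http://www.gwz.fudan.edu.cn/Web/Show/3063',
    # ...
}

# loop_through_url() then opens each URL with driver.get and keeps only the
# entries for which get_article() finds a .doc/.pdf download link.
downloadables = loop_through_url(caption_link)
```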
fudan_old.py
import re
import time
from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from urllib.parse import unquote
from urllib import request
from datetime import date, datetime
import json
# CONSTANTS
XPATH = {
    "captions": '//tbody/tr/td[2]/a',
    "date_published": '//tbody/tr[position() > 1]/td[4]',
    "max_page_num": "//table[2]/tbody/tr/td[1]",
    "downloads": "/html/body/div[2]/div[2]/div/div[2]/span/p/a"
}

# Initialize driver
def driver_init(keyword):
    global driver
    url = 'http://www.gwz.fudan.edu.cn/Web/Search?s=' + keyword
    options = Options()
    options.page_load_strategy = 'eager'
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(url)
    except:
        driver.refresh()
    return driver

def stop_loading_page_when_element_is_present(xpath):
    global driver
    wait = WebDriverWait(driver, 100)
    wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
    driver.execute_script("window.stop();")

def turn_page():
    global driver
    ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)
    wait = WebDriverWait(driver, 30, ignored_exceptions=ignored_exceptions)
    try:
        wait.until(EC.presence_of_element_located((By.LINK_TEXT, "下页")))
        driver.execute_script("window.stop();")
    except:
        print('No button with 「下页」 found.')
        return
    try:
        wait.until(
            EC.element_to_be_clickable((By.LINK_TEXT, "下页")))
        driver.find_element_by_link_text("下页").click()
        print("Navigating to Next Page")
    except (TimeoutException, WebDriverException):
        print("Last page reached")

def max_page_num():
    global driver
    elem = driver.find_element_by_xpath(XPATH['max_page_num'])
    text = elem.text
    max_pg = re.search("共.+?条记录, 页.+?/(.+)", text).group(1).strip()
    return int(max_pg)

def captions_dict():
    global driver
    ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)
    wait = WebDriverWait(driver, 30, ignored_exceptions=ignored_exceptions)
    captions = []
    links = []
    dates = []
    for i in range(max_page_num()):
        content = wait.until(EC.presence_of_all_elements_located((By.XPATH, XPATH['captions'])))
        time.sleep(3)
        stop_loading_page_when_element_is_present(XPATH['captions'])
        for item in content:
            captions.append(item.text)
            links.append(item.get_attribute('href'))
        date_published = driver.find_elements_by_xpath(XPATH['date_published'])
        for item in date_published:
            dates.append(item.text)
        turn_page()
    # convert to dictionary to remove duplicated captions.
    caption_link = dict(zip(captions, links))
    driver.close()
    return caption_link, dates

def get_article():
    global driver
    try:
        caption = driver.find_element_by_class_name('title')
        author, title = caption.text.split(":")
        stop_loading_page_when_element_is_present("//*[ contains (text(), '点击下载附件' ) ]")
    except:
        return
    # dl = driver.find_element_by_xpath(" //*[ contains (text(), '点击下载附件' ) ]/parent::*")
    # dl = dl.find_element_by_tag_name('a')
    dl = driver.find_element_by_xpath("//*[contains (text(), '.doc') or contains (text(), '.pdf')]/parent::*/parent::*//a")
    download_link = unquote(dl.get_attribute('href'))
    if download_link:
        print("Article found!")
    if author == "網摘":
        author = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div/div[2]/span/div[1]/p[2]/b').text
        title = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div/div[2]/span/div[1]/h2/span').text
    rslt = {"author": author, "title": title, "url": download_link}
    return rslt

def loop_through_url(dict_with_url_as_values):
    keys_lst = list(dict_with_url_as_values.keys())
    url_lst = list(dict_with_url_as_values.values())
    downloadables = {}
    for i, item in enumerate(url_lst):
        global driver
        driver = webdriver.Firefox()
        try:
            driver.get(item)
        except:
            driver.refresh()
        stop_loading_page_when_element_is_present("/html/body/div[2]/div[2]/div/div[2]/span")
        print("Visiting ", keys_lst[i])
        result = get_article()
        if result:
            if len(result) > 1:
                downloadables.update({keys_lst[i]: result})
        driver.close()
    return downloadables

def search(keyword, output_format="json"):
    """Loop through list of search terms and
    compile search results together."""
    global driver
    search_results = []
    not_found = []
    if isinstance(keyword, list):
        print("Searching through a list of", len(keyword), "keywords...\n")
        # items=list[map(lambda x: title_search(x), keyword)]
        for i, item in enumerate(keyword):
            single_search_result = search(item)
            if single_search_result:
                search_results.extend(single_search_result)
                print(i + 1, item)
            else:
                not_found.append(item)
        print("\n", len(not_found), " titles cannot be found:\n")
        print(*not_found, sep='\n')
        return search_results, not_found
    else:
        driver_init(keyword)
        stop_loading_page_when_element_is_present(XPATH['captions'])
        if output_format == "json":
            single_search_result, dates = captions_dict()
        # elif output_format == "bib":  # ignore for now.
        #     single_search_result = add_to_bib()
        # elif output_format == "zot":
        #     single_search_result = add_to_zotero()
        else:
            print("Invalid output format.")
        # driver.close()
        return single_search_result, dates

def main(keyword):
    caption_link = search(keyword)
    rslt = loop_through_url(caption_link[0])
    dates = caption_link[1]
    for i, k in enumerate(list(rslt.keys())):
        rslt[k]['date'] = date.isoformat(
            datetime.strptime(dates[i], '%Y/%m/%d'))
    with open('fudan_search_result.json', 'w') as file:
        file.write(str(date.today()))
        file.write("\n")
        json.dump(rslt, file, ensure_ascii=False, indent=4)
    print('Done!')
    return caption_link, rslt


if __name__ == '__main__':
    main('人性論')
Output:
{
    "梅廣:《大學》古本新訂": {
        "author": "梅廣",
        "title": "《大學》古本新訂",
        "url": "http://www.gwz.fudan.edu.cn/lunwen/1796梅廣:《大學》古本新訂.doc",
        "date": "2017-06-12"
    },
    "裘錫圭:由郭店簡〈性自命出〉的「室性者故也」說到《孟子》的「天下之言性也」章": {
        "author": "裘錫圭",
        "title": "由郭店簡〈性自命出〉的「室性者故也」說到《孟子》的「天下之言性也」章",
        "url": "http://www.gwz.fudan.edu.cn/Web/Show/articles/up/0059由郭店簡〈性自命出〉的「室性者故也」說到《孟子》的「天下之言性也」章.doc",
        "date": "2011-04-28"
    }
}
Questions:
- How do we improve on the old script by incorporating its search-loop functionality into the new script above? (A rough sketch of what I mean follows this list.)
- A demonstration of the `Requests` approach, if more suited to the task, would also be welcome! (A sketch of what I imagine this would look like is at the very end of the post.)
- One issue with the old script is that `date` needs to be put back into the dictionary in `main`.
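Regarding the first bullet, here is the rough, untested direction I have in mind: keep `MainPage` / `SearchResults` from `fudan.py`, and wrap them in a loop over keywords and pages. `fudan_search_many` is a made-up name, and the sketch assumes `next_page()` is changed to return whether it actually navigated. Since each `PrimaryResult` already carries its `date`, this would also sidestep the third bullet:

```python
def fudan_search_many(keywords) -> dict:
    if isinstance(keywords, str):
        keywords = [keywords]
    all_results = {}
    with Firefox() as driver:
        for keyword in keywords:
            driver.get('http://www.gwz.fudan.edu.cn/Web/Search?s=' + keyword + '&btnSearch=')
            page = MainPage(driver)
            results = SearchResults(driver)
            while True:
                for result in results.get_structured_elements():
                    # keying on caption deduplicates, like the old caption_link dict
                    all_results[result.caption] = result
                # assumes next_page() is modified to return True only when it
                # actually clicked 「下页」 -- the current version returns None
                if not page.next_page():
                    break
    return all_results
```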
Edit: I removed the `wuhan.py` update to make the question more concise. You can ignore `add_to_bib` and `add_to_zotero` for now; those functions were inherited from my old `cnki` script, and I have yet to update them to work for the present situation.
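For the second bullet, this is the kind of `Requests`-based approach I have in mind (untested; the CSS selectors are my guesses based on the XPaths used above, and the individual article pages are not handled at all):

```python
import requests
from bs4 import BeautifulSoup  # pip install beautifulsoup4


def requests_search(keyword: str):
    response = requests.get(
        'http://www.gwz.fudan.edu.cn/Web/Search',
        params={'s': keyword, 'btnSearch': ''},
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    for row in soup.select('table tr')[1:]:  # skip the header row
        cells = row.find_all('td')
        if len(cells) < 4:
            continue
        anchor = cells[1].find('a')
        yield {
            'caption': anchor.get_text(strip=True),
            'link': anchor.get('href'),  # may be relative to the site root
            'date': cells[3].get_text(strip=True),
        }
```

I don't know yet whether this handles paging or the site's encoding correctly, which is partly why a demonstration would be welcome.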