Revisions to Python phantomjs loading webpage not correct

edited body

Source Link

edited Jul 11, 2017 at 17:13

Napmi

541
2
14
34

I have an issue where extracting from this link

http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=FA&sub_category=FA1&alphabetical=All&company=5250

brings me data from this link instead which is the main page itself. http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all

Any idea why is this occuring ? I am using PhantomJS selenium and beautiful soup to assit me in this.

# The standard library modules
import os
import sys
import re
import sqlite3
import locale
# The wget module
import wget
import time
import calendar
from datetime import datetime
# The BeautifulSoup module
from bs4 import BeautifulSoup

# The selenium module
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url) # load the web page
    src = driver.page_source
    #Get text and split it
    soup = BeautifulSoup(src, 'html5lib')

    print soup

link ='http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=FA&sub_category=FA1&alphabetical=All&company=5250'
getURLS(link)

Solution from Alex Lucaci

def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url) # load the web page
    src = driver.page_source
    category_select = Select(driver.find_element_by_xpath('//*[@id="bm_announcement_types"]'))
    category_select.select_by_visible_text("Financial Results")
    category_select2 = Select(driver.find_element_by_xpath('//*[@id="bm_sub_announcement_types"]'))
    category_select2.select_by_visible_text("Financial Results")
    category_select3 = Select(driver.find_element_by_xpath('//*[@id="bm_company_list"]'))
    category_select3.select_by_visible_text("7-ELEVEN MALAYSIA HOLDINGS BERHAD (5250)")
    driver.find_element_by_xpath('//*[@id="bm_company_announcements_search_form"]/input[1]').click()

 
    #Get text andsrc split= itdriver.page_source
    soup = BeautifulSoup(src, 'html5lib')
 
    link="http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all"
    getURLS(link)

I have an issue where extracting from this link

http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=FA&sub_category=FA1&alphabetical=All&company=5250

brings me data from this link instead which is the main page itself. http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all

Any idea why is this occuring ? I am using PhantomJS selenium and beautiful soup to assit me in this.

# The standard library modules
import os
import sys
import re
import sqlite3
import locale
# The wget module
import wget
import time
import calendar
from datetime import datetime
# The BeautifulSoup module
from bs4 import BeautifulSoup

# The selenium module
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url) # load the web page
    src = driver.page_source
    #Get text and split it
    soup = BeautifulSoup(src, 'html5lib')

    print soup

link ='http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=FA&sub_category=FA1&alphabetical=All&company=5250'
getURLS(link)

Solution from Alex Lucaci

def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url) # load the web page
    src = driver.page_source
    category_select = Select(driver.find_element_by_xpath('//*[@id="bm_announcement_types"]'))
    category_select.select_by_visible_text("Financial Results")
    category_select2 = Select(driver.find_element_by_xpath('//*[@id="bm_sub_announcement_types"]'))
    category_select2.select_by_visible_text("Financial Results")
    category_select3 = Select(driver.find_element_by_xpath('//*[@id="bm_company_list"]'))
    category_select3.select_by_visible_text("7-ELEVEN MALAYSIA HOLDINGS BERHAD (5250)")
    driver.find_element_by_xpath('//*[@id="bm_company_announcements_search_form"]/input[1]').click()

 
    #Get text and split it
    soup = BeautifulSoup(src, 'html5lib')
 
link="http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all"
getURLS(link)

I have an issue where extracting from this link

http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=FA&sub_category=FA1&alphabetical=All&company=5250

brings me data from this link instead which is the main page itself. http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all

Any idea why is this occuring ? I am using PhantomJS selenium and beautiful soup to assit me in this.

# The standard library modules
import os
import sys
import re
import sqlite3
import locale
# The wget module
import wget
import time
import calendar
from datetime import datetime
# The BeautifulSoup module
from bs4 import BeautifulSoup

# The selenium module
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url) # load the web page
    src = driver.page_source
    #Get text and split it
    soup = BeautifulSoup(src, 'html5lib')

    print soup

link ='http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=FA&sub_category=FA1&alphabetical=All&company=5250'
getURLS(link)

Solution from Alex Lucaci

def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url) # load the web page
    src = driver.page_source
    category_select = Select(driver.find_element_by_xpath('//*[@id="bm_announcement_types"]'))
    category_select.select_by_visible_text("Financial Results")
    category_select2 = Select(driver.find_element_by_xpath('//*[@id="bm_sub_announcement_types"]'))
    category_select2.select_by_visible_text("Financial Results")
    category_select3 = Select(driver.find_element_by_xpath('//*[@id="bm_company_list"]'))
    category_select3.select_by_visible_text("7-ELEVEN MALAYSIA HOLDINGS BERHAD (5250)")
    driver.find_element_by_xpath('//*[@id="bm_company_announcements_search_form"]/input[1]').click()
    src = driver.page_source
    soup = BeautifulSoup(src, 'html5lib')
    link="http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all"
    getURLS(link)

deleted 4 characters in body

Source Link

edited Jul 11, 2017 at 9:01

Napmi

541
2
14
34

I have an issue where extracting from this link

http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=FA&sub_category=FA1&alphabetical=All&company=5250

brings me data from this link instead which is the main page itself. http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all

Any idea why is this occuring ? I am using PhantomJS selenium and beautiful soup to assit me in this.

# The standard library modules
import os
import sys
import re
import sqlite3
import locale
# The wget module
import wget
import time
import calendar
from datetime import datetime
# The BeautifulSoup module
from bs4 import BeautifulSoup

# The selenium module
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url) # load the web page
    src = driver.page_source
    #Get text and split it
    soup = BeautifulSoup(src, 'html5lib')

    print soup

link ='http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=FA&sub_category=FA1&alphabetical=All&company=5250'
getURLS(link)

Solution from Alex Lucaci

def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url) # load the web page
    src = driver.page_source
    category_select = Select(driver.find_element_by_xpath('//*[@id="bm_announcement_types"]'))
    category_select.select_by_visible_text("Financial Results")
    category_select2 = Select(driver.find_element_by_xpath('//*[@id="bm_sub_announcement_types"]'))
    category_select2.select_by_visible_text("Financial Results")
 
    category_select3 = Select(driver.find_element_by_xpath('//*[@id="bm_company_list"]'))
    category_select3.select_by_visible_text("7-ELEVEN MALAYSIA HOLDINGS BERHAD (5250)")
    driver.find_element_by_xpath('//*[@id="bm_company_announcements_search_form"]/input[1]').click()

    src = driver.page_source
    #Get text and split it
    soup = BeautifulSoup(src, 'html5lib')

link="http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all"
getURLS(link)

I have an issue where extracting from this link

http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=FA&sub_category=FA1&alphabetical=All&company=5250

brings me data from this link instead which is the main page itself. http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all

Any idea why is this occuring ? I am using PhantomJS selenium and beautiful soup to assit me in this.

# The standard library modules
import os
import sys
import re
import sqlite3
import locale
# The wget module
import wget
import time
import calendar
from datetime import datetime
# The BeautifulSoup module
from bs4 import BeautifulSoup

# The selenium module
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url) # load the web page
    src = driver.page_source
    #Get text and split it
    soup = BeautifulSoup(src, 'html5lib')

    print soup

link ='http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=FA&sub_category=FA1&alphabetical=All&company=5250'
getURLS(link)

Solution from Alex Lucaci

def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url) # load the web page
    category_select = Select(driver.find_element_by_xpath('//*[@id="bm_announcement_types"]'))
    category_select.select_by_visible_text("Financial Results")
    category_select2 = Select(driver.find_element_by_xpath('//*[@id="bm_sub_announcement_types"]'))
    category_select2.select_by_visible_text("Financial Results")
 
    category_select3 = Select(driver.find_element_by_xpath('//*[@id="bm_company_list"]'))
    category_select3.select_by_visible_text("7-ELEVEN MALAYSIA HOLDINGS BERHAD (5250)")
    driver.find_element_by_xpath('//*[@id="bm_company_announcements_search_form"]/input[1]').click()

    src = driver.page_source
    #Get text and split it
    soup = BeautifulSoup(src, 'html5lib')

link="http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all"
getURLS(link)

I have an issue where extracting from this link

http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=FA&sub_category=FA1&alphabetical=All&company=5250

brings me data from this link instead which is the main page itself. http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all

Any idea why is this occuring ? I am using PhantomJS selenium and beautiful soup to assit me in this.

# The standard library modules
import os
import sys
import re
import sqlite3
import locale
# The wget module
import wget
import time
import calendar
from datetime import datetime
# The BeautifulSoup module
from bs4 import BeautifulSoup

# The selenium module
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url) # load the web page
    src = driver.page_source
    #Get text and split it
    soup = BeautifulSoup(src, 'html5lib')

    print soup

link ='http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=FA&sub_category=FA1&alphabetical=All&company=5250'
getURLS(link)

Solution from Alex Lucaci

def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url) # load the web page
    src = driver.page_source
    category_select = Select(driver.find_element_by_xpath('//*[@id="bm_announcement_types"]'))
    category_select.select_by_visible_text("Financial Results")
    category_select2 = Select(driver.find_element_by_xpath('//*[@id="bm_sub_announcement_types"]'))
    category_select2.select_by_visible_text("Financial Results")
    category_select3 = Select(driver.find_element_by_xpath('//*[@id="bm_company_list"]'))
    category_select3.select_by_visible_text("7-ELEVEN MALAYSIA HOLDINGS BERHAD (5250)")
    driver.find_element_by_xpath('//*[@id="bm_company_announcements_search_form"]/input[1]').click()


    #Get text and split it
    soup = BeautifulSoup(src, 'html5lib')

link="http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all"
getURLS(link)

found out how to load

Source Link

edited Jul 11, 2017 at 8:42

Napmi

541
2
14
34

new code from Alex LucaciSolution from Alex Lucaci

def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url) # load the web page
    category_select = Select(driver.find_element_by_xpath('//*[@id="bm_announcement_types"]'))
    category_select.select_by_visible_text("Financial Results")
    category_select2 = Select(driver.find_element_by_xpath('//*[@id="bm_sub_announcement_types"]'))
    category_select2.select_by_visible_text("Financial Results")

    category_select3 = Select(driver.find_element_by_xpath('//*[@id="bm_company_list"]'))
    category_select3.select_by_visible_text("7-ELEVEN MALAYSIA HOLDINGS BERHAD (5250)")
 
    driver.getfind_element_by_xpath(url'//*[@id="bm_company_announcements_search_form"]/input[1]') # load the web page.click()

    src = driver.page_source
    #Get text and split it
    soup = BeautifulSoup(src, 'html5lib')

link="http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all"
getURLS(link)

i received this error

> Traceback (most recent call last):   File "extractLinks.py", line 43,
> in <module>
>     getURLS(link)   File "extractLinks.py", line 25, in getURLS
>     category_select = Select(driver.find_element_by_xpath('//*[@id="bm_announcement_types"]'))
> File
> "/home/ec2-user/summaryMal/env/local/lib/python2.7/site-packages/selenium/webdriver/remote/webdriver.py",
> line 313, in find_element_by_xpat                                     
> h
>     return self.find_element(by=By.XPATH, value=xpath)   File "/home/ec2-user/summaryMal/env/local/lib/python2.7/site-packages/selenium/webdriver/remote/webdriver.py",
> line 791, in find_element
>     'value': value})['value']   File "/home/ec2-user/summaryMal/env/local/lib/python2.7/site-packages/selenium/webdriver/remote/webdriver.py",
> line 256, in execute
>     self.error_handler.check_response(response)   File "/home/ec2-user/summaryMal/env/local/lib/python2.7/site-packages/selenium/webdriver/remote/errorhandler.py",
> line 194, in check_response
>     raise exception_class(message, screen, stacktrace) selenium.common.exceptions.NoSuchElementException: Message:
> {"errorMessage":"Unable to find element with xpath
> '//*[@id=\"bm_announcement_types\"]                                   
> '","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"118","Content-Type":"appl
> ication/json;charset=UTF-8","Host":"127.0.0.1:40941","User-Agent":"Python
> http auth"},"httpVersion":"1.1","method":"POST","post":"{\"using\":
> \"xp                                                                  
> ath\", \"sessionId\": \"09369a20-6611-11e7-8a22-053025ba878c\",
> \"value\":
> \"//*[@id=\\\"bm_announcement_types\\\"]\"}","url":"/element","urlParse
> d":{"anchor":"","query":"","file":"element","directory":"/","path":"/element","relative":"/element","port":"","host":"","password":"","user":"","u
> serInfo":"","authority":"","protocol":"","source":"/element","queryKey":{},"chunks":["element"]},"urlOriginal":"/session/09369a20-6611-11e7-8a22-0
> 53025ba878c/element"}}

new code from Alex Lucaci

def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    category_select = Select(driver.find_element_by_xpath('//*[@id="bm_announcement_types"]'))
    category_select.select_by_visible_text("Financial Results")
    category_select2 = Select(driver.find_element_by_xpath('//*[@id="bm_sub_announcement_types"]'))
    category_select2.select_by_visible_text("Financial Results")

    category_select3 = Select(driver.find_element_by_xpath('//*[@id="bm_company_list"]'))
    category_select3.select_by_visible_text("7-ELEVEN MALAYSIA HOLDINGS BERHAD (5250)")
 
    driver.get(url) # load the web page
    src = driver.page_source
    #Get text and split it
    soup = BeautifulSoup(src, 'html5lib')

link="http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all"
getURLS(link)

i received this error

> Traceback (most recent call last):   File "extractLinks.py", line 43,
> in <module>
>     getURLS(link)   File "extractLinks.py", line 25, in getURLS
>     category_select = Select(driver.find_element_by_xpath('//*[@id="bm_announcement_types"]'))
> File
> "/home/ec2-user/summaryMal/env/local/lib/python2.7/site-packages/selenium/webdriver/remote/webdriver.py",
> line 313, in find_element_by_xpat                                     
> h
>     return self.find_element(by=By.XPATH, value=xpath)   File "/home/ec2-user/summaryMal/env/local/lib/python2.7/site-packages/selenium/webdriver/remote/webdriver.py",
> line 791, in find_element
>     'value': value})['value']   File "/home/ec2-user/summaryMal/env/local/lib/python2.7/site-packages/selenium/webdriver/remote/webdriver.py",
> line 256, in execute
>     self.error_handler.check_response(response)   File "/home/ec2-user/summaryMal/env/local/lib/python2.7/site-packages/selenium/webdriver/remote/errorhandler.py",
> line 194, in check_response
>     raise exception_class(message, screen, stacktrace) selenium.common.exceptions.NoSuchElementException: Message:
> {"errorMessage":"Unable to find element with xpath
> '//*[@id=\"bm_announcement_types\"]                                   
> '","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"118","Content-Type":"appl
> ication/json;charset=UTF-8","Host":"127.0.0.1:40941","User-Agent":"Python
> http auth"},"httpVersion":"1.1","method":"POST","post":"{\"using\":
> \"xp                                                                  
> ath\", \"sessionId\": \"09369a20-6611-11e7-8a22-053025ba878c\",
> \"value\":
> \"//*[@id=\\\"bm_announcement_types\\\"]\"}","url":"/element","urlParse
> d":{"anchor":"","query":"","file":"element","directory":"/","path":"/element","relative":"/element","port":"","host":"","password":"","user":"","u
> serInfo":"","authority":"","protocol":"","source":"/element","queryKey":{},"chunks":["element"]},"urlOriginal":"/session/09369a20-6611-11e7-8a22-0
> 53025ba878c/element"}}

Solution from Alex Lucaci

def getURLS(url):
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(url) # load the web page
    category_select = Select(driver.find_element_by_xpath('//*[@id="bm_announcement_types"]'))
    category_select.select_by_visible_text("Financial Results")
    category_select2 = Select(driver.find_element_by_xpath('//*[@id="bm_sub_announcement_types"]'))
    category_select2.select_by_visible_text("Financial Results")

    category_select3 = Select(driver.find_element_by_xpath('//*[@id="bm_company_list"]'))
    category_select3.select_by_visible_text("7-ELEVEN MALAYSIA HOLDINGS BERHAD (5250)")
    driver.find_element_by_xpath('//*[@id="bm_company_announcements_search_form"]/input[1]').click()

    src = driver.page_source
    #Get text and split it
    soup = BeautifulSoup(src, 'html5lib')

link="http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all"
getURLS(link)

testing out Alex Lucaci code

Source Link

edited Jul 11, 2017 at 8:17

Napmi

541
2
14
34

Loading

Source Link

asked Jul 7, 2017 at 7:16

Napmi

541
2
14
34

Loading

Collectives™ on Stack Overflow

Return to Question

Post Timeline